In [1]:
import xml.etree.ElementTree as ET

print("hello")
# allowed_columns = {'Id'}
# Table names accepted by check_valid_fk_ids() (it compares them case-insensitively).
# NOTE(review): the chunking cell further down rebinds input_files to the dump file names,
# so the effective whitelist depends on which cells have been executed.
input_files = ["users", "badges", "tags", "posts", "posttags", "comments"]


hello


### Save chunked samples of each dump as small data

In [None]:
import xml.etree.ElementTree as ET

def process_xml_in_chunks(input_file, chunk_size=5, output_prefix="chunked_output", max_chunks=1, xml_file=""):
    """
    Split the beginning of a large XML file into small sample files.

    Elements below the document root are gathered `chunk_size` at a time and
    each full group is written out through save_chunk(). Parsing stops as soon
    as `max_chunks` files have been written.

    :param input_file: Path to the large XML file.
    :param chunk_size: Number of elements to put in each chunk.
    :param output_prefix: Prefix for output file names.
    :param max_chunks: Maximum number of chunks to write.
    :param xml_file: Name of the source dump; forwarded to save_chunk() for the output file name.
    """
    context = ET.iterparse(input_file, events=("start", "end"))
    # The first event is the start of the document root; its tag is reused as
    # the root tag of every chunk file.
    _, root = next(context)

    chunk_count = 0
    elements_in_chunk = 0
    chunk_root = ET.Element(root.tag)  # Fresh root for the chunk being built

    for event, elem in context:
        # Only completed elements other than the document root are collected.
        if event != "end" or elem.tag == root.tag:
            continue

        chunk_root.append(elem)
        elements_in_chunk += 1

        # If we've reached the chunk size, save the chunk and reset
        if elements_in_chunk >= chunk_size:
            chunk_count += 1
            save_chunk(chunk_root, output_prefix, chunk_count, xml_file)

            # Detach the written elements from the parsed tree so it does not
            # keep growing, then start a new chunk.
            root.clear()
            chunk_root = ET.Element(root.tag)
            elements_in_chunk = 0

        # Stop after processing the max number of chunks
        if chunk_count >= max_chunks:
            print(f"Reached maximum chunk count ({max_chunks}). Terminating.")
            break

    # If there are any remaining elements in the final chunk, save it
    if elements_in_chunk > 0 and chunk_count < max_chunks:
        chunk_count += 1
        save_chunk(chunk_root, output_prefix, chunk_count, xml_file)

    print(f"Processed {chunk_count} chunks of {chunk_size} elements each.")

def save_chunk(chunk_root, output_prefix, chunk_count, xml_file):
    """
    Write one chunk to small_data/<xml_file>_<output_prefix>_<chunk_count>.xml.

    :param chunk_root: Root element holding the chunk's child elements.
    :param output_prefix: Prefix used in the output file name.
    :param chunk_count: 1-based index of the chunk, used in the file name.
    :param xml_file: Name of the source dump, used in the file name.
    """
    import os  # local import: the notebook only imports os in a later cell

    output_file = f"small_data/{xml_file}_{output_prefix}_{chunk_count}.xml"
    # Create the target directory on first use instead of failing on a fresh checkout.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    ET.ElementTree(chunk_root).write(output_file, encoding="utf-8", xml_declaration=True)
    print(f"Saved chunk {chunk_count} to {output_file}")

# Usage example
# Write a single 100-element sample file for every dump in the askubuntu folder.
input_files = ["Users", "Badges", "Posts", "PostLinks", "Comments", "PostHistory", "Votes", "Tags"]
for file in input_files:
    input_file = f"askubuntu/{file}.xml"  # Path to the large XML dump
    process_xml_in_chunks(input_file=input_file, chunk_size=100, max_chunks=1, xml_file=file)

### DB operation code

In [None]:
# !pip install psycopg2==2.9.9
# !pip install pyyaml==6.0.2

In [3]:
import psycopg2
import psycopg2.extras as extras
import yaml
import os

def connect():
    """
    Open a new psycopg2 connection described by db.yml in the current working directory.

    The YAML file must provide the keys database, user, password, host and port.

    :return: The connection object returned by psycopg2.connect().
    """
    # yml_path = os.path.join(os.path.dirname(__file__), './db.yml')
    yml_path = os.path.join(os.getcwd(), 'db.yml')
    with open(yml_path, 'r') as file:
        config = yaml.load(file, Loader=yaml.FullLoader)
    connection_args = {
        'dbname': config['database'],
        'user': config['user'],
        'password': config['password'],
        'host': config['host'],
        'port': config['port'],
    }
    return psycopg2.connect(**connection_args)

def exec_sql_file(path):
    """
    Execute the whole contents of an SQL script and commit.

    :param path: Script path, resolved relative to the current working directory.
    """
    # full_path = os.path.join(os.path.dirname(__file__), f'./{path}')
    current_directory = os.getcwd()
    full_path = os.path.join(current_directory, f'./{path}')
    conn = connect()
    try:
        cur = conn.cursor()
        with open(full_path, 'r') as file:
            cur.execute(file.read())
        conn.commit()
    finally:
        # Always release the connection, even when the script fails.
        conn.close()

def exec_get_one(sql, args={}):
    """
    Run a query on a fresh connection and return the first result row.

    :param sql: SQL text, optionally with psycopg2 placeholders.
    :param args: Placeholder values passed to cursor.execute(); never mutated here.
    :return: Whatever cursor.fetchone() returns for the query.
    """
    conn = connect()
    try:
        cur = conn.cursor()
        cur.execute(sql, args)
        return cur.fetchone()
    finally:
        # Always release the connection, even when the query fails.
        conn.close()

def exec_get_all(sql, args={}):
    """
    Run a query on a fresh connection and return every result row.

    :param sql: SQL text, optionally with psycopg2 placeholders.
    :param args: Placeholder values passed to cursor.execute(); never mutated here.
    :return: Whatever cursor.fetchall() returns for the query.
    """
    conn = connect()
    try:
        cur = conn.cursor()
        cur.execute(sql, args)
        # https://www.psycopg.org/docs/cursor.html#cursor.fetchall
        return cur.fetchall()
    finally:
        # Always release the connection, even when the query fails.
        conn.close()

def exec_commit(sql, args={}):
    """
    Execute a statement on a fresh connection and commit it.

    :param sql: SQL text, optionally with psycopg2 placeholders.
    :param args: Placeholder values passed to cursor.execute(); never mutated here.
    :return: The return value of cursor.execute().
    """
    conn = connect()
    try:
        cur = conn.cursor()
        result = cur.execute(sql, args)
        conn.commit()
        return result
    finally:
        # Always release the connection, even when the statement fails.
        conn.close()

def execute_df_values(sql, tuples):
    """
    Bulk-execute a statement with psycopg2.extras.execute_values() and commit.

    :param sql: Statement containing the VALUES %s placeholder.
    :param tuples: Sequence of row tuples to insert.
    :return: The return value of extras.execute_values().
    """
    conn = connect()
    try:
        cur = conn.cursor()
        result = extras.execute_values(cur, sql, tuples)
        conn.commit()
        return result
    finally:
        # Always release the connection, even when the insert fails.
        conn.close()



In [None]:
### Read and Insert data into DB

In [4]:
### ALL INSERT QUERIES

# Number of rows buffered before a bulk insert; the *_db_chunks loaders read
# this as a module-level global.
chunk_size = 5000

# Each statement below ends in a single "VALUES %s" placeholder. The loaders
# hand the statement and a list of row tuples to execute_df_values(), which
# forwards them to psycopg2.extras.execute_values(). The order of the columns
# listed here must match the order of the values in the row tuples.

# Column order for rows built by users_db_chunks().
users_query = """
    INSERT INTO users (Id, AboutMe, AccountId, CreationDate, DisplayName,
    DownVotes, LastAccessDate, Location, Reputation, UpVotes,
    Views, WebsiteUrl) 
    VALUES %s
    """

# Column order for rows built by badges_db_chunks().
badges_query = """
    INSERT INTO badges (Id, Class, Date, Name, TagBased, UserId) 
    VALUES %s
    """

# Column order for rows built by posts_db_chunks().
posts_query = """
    INSERT INTO posts (Id, AcceptedAnswerId,AnswerCount,Body,
        ClosedDate,CommentCount,CommunityOwnedDate,ContentLicense,
        CreationDate,FavoriteCount,LastActivityDate,LastEditDate,
        LastEditorDisplayName,LastEditorUserId,OwnerDisplayName,
        OwnerUserId, ParentId,PostTypeId, Score,
        Tags, Title, ViewCount) 
    VALUES %s
    """

# Column order for rows built by post_links_db_chunks().
post_links_query = """
    INSERT INTO postlinks (Id, CreationDate, LinkTypeId, PostId, RelatedPostId)
    VALUES %s
    """

# Column order for rows built by comments_db_chunks().
comments_query = """
    INSERT INTO comments (Id, ContentLicense, CreationDate, PostId, Score,
        Text, UserDisplayName, UserId)
    VALUES %s
    """

# Column order for rows built by post_hist_db_chunks().
post_history_query = """
    INSERT INTO posthistory (Id, Comment, ContentLicense, CreationDate, PostHistoryTypeId,
        PostId, RevisionGUID, Text, UserDisplayName, UserId)
    VALUES %s
    """

# Column order for rows built by votes_db_chunks().
votes_query = """
    INSERT INTO votes (Id, BountyAmount, CreationDate, PostId, UserId, VoteTypeId)
    VALUES %s
    """

# Column order for the tags table.
tags_query = """
    INSERT INTO tags (Id, Count, ExcerptPostId, TagName, WikiPostId)
    VALUES %s
    """

In [5]:
# def check_foreign_key_exists(table, att_key, att_val):
#     # SQL query to check if the userid exists in the users table
#     if table.lower() not in {t.lower() for t in input_files} and att_key not in {c.lower() for c in allowed_columns}:
#         raise ValueError("Invalid table name")
#     sql = "SELECT COUNT(Id) FROM " + table + " WHERE " + att_key + " = %s"
#     result = exec_get_one(sql, (att_val,))
#     return result[0] > 0

def insert_chunk_into_db(query, chunk_data):
    """
    Bulk-insert one chunk of rows by delegating to execute_df_values().

    :param query: INSERT statement containing the VALUES %s placeholder.
    :param chunk_data: List of row tuples to insert.
    """
    execute_df_values(sql=query, tuples=chunk_data)


In [None]:
### FOREIGN-KEY VALIDATION HELPERS (used by the insert cells below)
def remove_invalid_entries(chunk_data, valid_ids, loc):
    """
    Keep only the rows whose value at position `loc` is contained in `valid_ids`.

    The value is compared as stored in the row (no type conversion).
    """
    kept_rows = []
    for row in chunk_data:
        if row[loc] in valid_ids:
            kept_rows.append(row)
    return kept_rows

def remove_invalid_entries_links(chunk_data, valid_ids, loc):
    """
    Keep only the rows whose FK value at position `loc` is contained in `valid_ids`.

    The row value is converted with int() before the membership test, so rows
    may carry the id as a string while `valid_ids` holds integers.

    :param chunk_data: List of row tuples.
    :param valid_ids: Iterable of ids known to exist.
    :param loc: Index of the FK value inside each row tuple.
    :return: New list with the rows that reference a valid id, in original order.
    """
    # Build the lookup once so each row costs a hash probe instead of a list scan.
    valid_lookup = set(valid_ids)
    return [data for data in chunk_data if int(data[loc]) in valid_lookup]

def extract_ids_from_chunk(chunk_data, loc):
    """
    Collect the value at position `loc` of every row, converted to int.

    The result keeps row order and duplicates.
    """
    ids = []
    for row in chunk_data:
        ids.append(int(row[loc]))
    return ids

def check_valid_fk_ids(table, ids):
    """
    Look up which of `ids` exist as Id values in `table`.

    :param table: Table to query; must match an entry of the module-level
        `input_files` list (compared case-insensitively).
    :param ids: Candidate Id values. An empty collection returns an empty set
        without touching the database.
    :return: Set of the Ids that were found in `table`.
    :raises ValueError: If `table` is not in the whitelist.
    """
    if not ids:
        return set()
    # The table name is concatenated into the SQL text, so it is only accepted
    # when it appears in the whitelist.
    if table.lower() not in {t.lower() for t in input_files}:
        raise ValueError("Invalid table name")
    sql = "SELECT Id FROM " + table + " WHERE Id IN %s"
    # De-duplicate so each candidate appears once in the IN list.
    result = exec_get_all(sql, (tuple(set(ids)),))
    return {row[0] for row in result}

In [7]:
import xml.etree.ElementTree as ET
# NOTE(review): create_schema.sql is executed again by every loader cell; presumably the
# script is idempotent and does not drop tables that are already loaded — confirm.
exec_sql_file('create_schema.sql')
#### INSERT INTO USERS

def users_db_chunks(input_file, query, max_chunks=1):
    """
    Stream the <row> elements of a Users XML dump into the database in batches.

    Rows are buffered and bulk-inserted with execute_df_values() every time the
    buffer reaches the module-level `chunk_size`; a final partial buffer is
    inserted at the end. Rows without an Id attribute are skipped.

    :param input_file: Path to the large XML file.
    :param query: INSERT ... VALUES %s statement used for every batch.
    :param max_chunks: Accepted for backward compatibility; the chunk cap is
        disabled, so this value is ignored.
    """
    context = ET.iterparse(input_file, events=("start", "end"))
    # The first event is the start of the document root; keep a handle on it so
    # processed rows can be detached from the tree.
    _, root = next(context)

    chunk_count = 0
    chunk_data = []  # Rows waiting to be inserted

    for event, elem in context:
        # Handle each row exactly once, when it has been completely parsed.
        if event != "end" or elem.tag != 'row':
            continue

        if elem.get('Id') is not None:
            chunk_data.append((
                elem.get('Id'),
                elem.get('AboutMe'),
                elem.get('AccountId'),
                elem.get('CreationDate'),
                elem.get('DisplayName'),
                elem.get('DownVotes'),
                elem.get('LastAccessDate'),
                elem.get('Location'),
                elem.get('Reputation'),
                elem.get('UpVotes'),
                elem.get('Views'),
                elem.get('WebsiteUrl')
            ))

        # If we've reached the chunk size, insert the chunk and reset
        if len(chunk_data) >= chunk_size:
            chunk_count += 1
            execute_df_values(query, chunk_data)
            chunk_data = []

        # Drop processed rows so the parsed tree does not grow with the file.
        root.clear()

    # If there are any remaining elements in the final chunk, insert them
    if chunk_data:
        chunk_count += 1
        execute_df_values(query, chunk_data)

    print(f"Processed {chunk_count} chunks of {chunk_size} elements each.")

# Load the Users dump into the users table.
for file in ("Users",):
    input_file = f"askubuntu/{file}.xml"  # Path to the large XML dump
    users_db_chunks(input_file, users_query, max_chunks=1)


Processed 291 chunks of 5000 elements each.


In [8]:
# def check_foreign_keys_in_chunk(table, ids):
#     # Create a query to check the existence of multiple IDs
#     formatted_ids = ','.join(map(str, ids))
#     sql = f"SELECT COUNT(*) FROM {table} WHERE Id IN ({formatted_ids})"
#     result = exec_get_one(sql)
#     return result[0]

# def check_foreign_key_exists(table, att_key, att_val):
#     # SQL query to check if the userid exists in the users table
#     if table.lower() not in {t.lower() for t in input_files} and att_key not in {c.lower() for c in allowed_columns}:
#         raise ValueError("Invalid table name")
#     sql = "SELECT COUNT(Id) FROM " + table + " WHERE " + att_key + " = %s"
#     result = exec_get_one(sql, (att_val,))
#     return result[0] > 0



def remove_invalid_entries(chunk_data, valid_ids, loc):
    """
    Keep only the rows whose value at position `loc` is contained in `valid_ids`.

    The value is compared as stored in the row (no type conversion).
    """
    # NOTE(review): an earlier cell defines a function with this same name; the
    # definition executed last is the one in effect.
    kept_rows = []
    for row in chunk_data:
        if row[loc] in valid_ids:
            kept_rows.append(row)
    return kept_rows

def remove_invalid_entries_links(chunk_data, valid_ids, loc):
    """
    Keep only the rows whose FK value at position `loc` is contained in `valid_ids`.

    The row value is converted with int() before the membership test, so rows
    may carry the id as a string while `valid_ids` holds integers.

    :param chunk_data: List of row tuples.
    :param valid_ids: Iterable of ids known to exist.
    :param loc: Index of the FK value inside each row tuple.
    :return: New list with the rows that reference a valid id, in original order.
    """
    # NOTE(review): an earlier cell defines a function with this same name; the
    # definition executed last is the one in effect.
    # Build the lookup once so each row costs a hash probe instead of a list scan.
    valid_lookup = set(valid_ids)
    return [data for data in chunk_data if int(data[loc]) in valid_lookup]

def extract_ids_from_chunk(chunk_data, loc):
    """
    Collect the value at position `loc` of every row, converted to int.

    The result keeps row order and duplicates.
    """
    # NOTE(review): an earlier cell defines a function with this same name; the
    # definition executed last is the one in effect.
    ids = []
    for row in chunk_data:
        ids.append(int(row[loc]))
    return ids

def check_valid_fk_ids(table, ids):
    """
    Look up which of `ids` exist as Id values in `table`.

    :param table: Table to query; must match an entry of the module-level
        `input_files` list (compared case-insensitively).
    :param ids: Candidate Id values. An empty collection returns an empty set
        without touching the database.
    :return: Set of the Ids that were found in `table`.
    :raises ValueError: If `table` is not in the whitelist.
    """
    # NOTE(review): an earlier cell defines a function with this same name; the
    # definition executed last is the one in effect.
    if not ids:
        return set()
    # The table name is concatenated into the SQL text, so it is only accepted
    # when it appears in the whitelist.
    if table.lower() not in {t.lower() for t in input_files}:
        raise ValueError("Invalid table name")
    sql = "SELECT Id FROM " + table + " WHERE Id IN %s"
    # De-duplicate so each candidate appears once in the IN list.
    result = exec_get_all(sql, (tuple(set(ids)),))
    return {row[0] for row in result}

In [None]:
### Badges Insert

In [9]:
#### INSERT INTO BADGES
# NOTE(review): create_schema.sql is executed again by every loader cell; presumably the
# script is idempotent and does not drop tables that are already loaded — confirm.
exec_sql_file('create_schema.sql')

def badges_db_chunks(input_file, query, max_chunks=1):
    """
    Stream the <row> elements of a Badges XML dump into the database in batches.

    Rows without a UserId attribute are skipped. Before each batch is inserted,
    rows whose UserId does not exist in the users table are dropped. Batches
    are flushed when they reach the module-level `chunk_size`.

    :param input_file: Path to the large XML file.
    :param query: INSERT ... VALUES %s statement used for every batch.
    :param max_chunks: Accepted for backward compatibility; the chunk cap is
        disabled, so this value is ignored.
    """

    def _flush(rows):
        """Insert `rows`, dropping those whose UserId (last column) is unknown."""
        user_ids = extract_ids_from_chunk(rows, -1)
        valid_user_ids = check_valid_fk_ids('users', user_ids)
        if len(valid_user_ids) != len(set(user_ids)):
            # Some UserIds are invalid, filter out invalid entries
            rows = remove_invalid_entries_links(rows, valid_user_ids, -1)
        execute_df_values(query, rows)

    context = ET.iterparse(input_file, events=("start", "end"))
    # The first event is the start of the document root; keep a handle on it so
    # processed rows can be detached from the tree.
    _, root = next(context)

    chunk_count = 0
    chunk_data = []  # Rows waiting to be inserted

    for event, elem in context:
        # Handle each row exactly once, when it has been completely parsed.
        if event != "end" or elem.tag != 'row':
            continue

        if elem.get('UserId') is not None:
            chunk_data.append((
                elem.get('Id'),
                elem.get('Class'),
                elem.get('Date'),
                elem.get('Name'),
                elem.get('TagBased'),
                elem.get('UserId')
            ))

        # If we've reached the chunk size, insert the chunk and reset
        if len(chunk_data) >= chunk_size:
            chunk_count += 1
            _flush(chunk_data)
            chunk_data = []

        # Drop processed rows so the parsed tree does not grow with the file.
        root.clear()

    # If there are any remaining elements in the final chunk, insert them
    if chunk_data:
        chunk_count += 1
        _flush(chunk_data)

    print(f"Processed {chunk_count} chunks of {chunk_size} elements each.")

# Load the Badges dump into the badges table.
for file in ("Badges",):
    input_file = f"askubuntu/{file}.xml"  # Path to the large XML dump
    badges_db_chunks(input_file, badges_query, max_chunks=1)

Processed 396 chunks of 5000 elements each.


In [None]:
### POSTS INSERTS

In [None]:
#### INSERT INTO POSTS
# NOTE(review): create_schema.sql is executed again by every loader cell; presumably the
# script is idempotent and does not drop tables that are already loaded — confirm.
exec_sql_file('create_schema.sql')

def posts_db_chunks(input_file, query, max_chunks=1):
    """
    Stream the <row> elements of a Posts XML dump into the database in batches.

    Rows lacking an Id or an OwnerUserId attribute are skipped. Before each
    batch is inserted, rows whose OwnerUserId does not exist in the users table
    are dropped. Batches are flushed when they reach the module-level
    `chunk_size`.

    :param input_file: Path to the large XML file.
    :param query: INSERT ... VALUES %s statement used for every batch.
    :param max_chunks: Accepted for backward compatibility; the chunk cap is
        disabled, so this value is ignored.
    """

    def _flush(rows):
        """Insert `rows`, dropping those whose OwnerUserId (7th from the end) is unknown."""
        user_ids = extract_ids_from_chunk(rows, -7)
        valid_user_ids = check_valid_fk_ids('users', user_ids)
        if len(valid_user_ids) != len(set(user_ids)):
            # Some UserIds are invalid, filter out invalid entries
            rows = remove_invalid_entries_links(rows, valid_user_ids, -7)
        execute_df_values(query, rows)

    context = ET.iterparse(input_file, events=("start", "end"))
    # The first event is the start of the document root; keep a handle on it so
    # processed rows can be detached from the tree.
    _, root = next(context)

    chunk_count = 0
    chunk_data = []  # Rows waiting to be inserted

    for event, elem in context:
        # Handle each row exactly once, when it has been completely parsed.
        if event != "end" or elem.tag != 'row':
            continue

        if elem.get('Id') is not None and elem.get('OwnerUserId') is not None:
            # Value order follows the column list of posts_query.
            chunk_data.append((
                elem.get('Id'), elem.get('AcceptedAnswerId'), elem.get('AnswerCount'),
                elem.get('Body'), elem.get('ClosedDate'), elem.get('CommentCount'),
                elem.get('CommunityOwnedDate'), elem.get('ContentLicense'),
                elem.get('CreationDate'), elem.get('FavoriteCount'),
                elem.get('LastActivityDate'), elem.get('LastEditDate'),
                elem.get('LastEditorDisplayName'),
                elem.get('LastEditorUserId'),
                elem.get('OwnerDisplayName'),
                elem.get('OwnerUserId'), elem.get('ParentId'),
                elem.get('PostTypeId'), elem.get('Score'),
                elem.get('Tags'), elem.get('Title'),
                elem.get('ViewCount')
            ))

        # If we've reached the chunk size, insert the chunk and reset
        if len(chunk_data) >= chunk_size:
            chunk_count += 1
            _flush(chunk_data)
            chunk_data = []

        # Drop processed rows so the parsed tree does not grow with the file.
        root.clear()

    # If there are any remaining elements in the final chunk, insert them
    if chunk_data:
        chunk_count += 1
        _flush(chunk_data)

    print(f"Processed {chunk_count} chunks of {chunk_size} elements each.")

# Load the Posts dump into the posts table.
for file in ("Posts",):
    input_file = f"askubuntu/{file}.xml"  # Path to the large XML dump
    posts_db_chunks(input_file, posts_query, max_chunks=1)


In [None]:
### POSTLINKS INSERT

In [None]:
#### INSERT INTO POSTLINKS
# NOTE(review): create_schema.sql is executed again by every loader cell; presumably the
# script is idempotent and does not drop tables that are already loaded — confirm.
exec_sql_file('create_schema.sql')

def post_links_db_chunks(input_file, query, max_chunks=1):
    """
    Stream the <row> elements of a PostLinks XML dump into the database in batches.

    Rows lacking an Id, PostId or RelatedPostId attribute are skipped. Before
    each batch is inserted, rows whose PostId or RelatedPostId does not exist in
    the posts table are dropped. Batches are flushed when they reach the
    module-level `chunk_size`.

    :param input_file: Path to the large XML file.
    :param query: INSERT ... VALUES %s statement used for every batch.
    :param max_chunks: Accepted for backward compatibility; the chunk cap is
        disabled, so this value is ignored.
    """

    def _flush(rows):
        """Insert `rows`, dropping those that reference a post missing from posts."""
        post_ids = extract_ids_from_chunk(rows, -2)
        valid_post_ids = check_valid_fk_ids('posts', post_ids)
        related_post_ids = extract_ids_from_chunk(rows, -1)
        valid_related_post_ids = check_valid_fk_ids('posts', related_post_ids)

        all_posts_known = len(valid_post_ids) == len(set(post_ids))
        all_related_known = len(valid_related_post_ids) == len(set(related_post_ids))
        if not (all_posts_known and all_related_known):
            # Some post ids are invalid, filter out invalid entries
            rows = remove_invalid_entries_links(rows, valid_post_ids, -2)
            rows = remove_invalid_entries_links(rows, valid_related_post_ids, -1)
        insert_chunk_into_db(query, rows)

    context = ET.iterparse(input_file, events=("start", "end"))
    # The first event is the start of the document root; keep a handle on it so
    # processed rows can be detached from the tree.
    _, root = next(context)

    chunk_count = 0
    chunk_data = []  # Rows waiting to be inserted

    for event, elem in context:
        # Handle each row exactly once, when it has been completely parsed.
        if event != "end" or elem.tag != 'row':
            continue

        if elem.get('Id') is not None and elem.get('PostId') is not None and elem.get('RelatedPostId') is not None:
            chunk_data.append((
                elem.get('Id'), elem.get('CreationDate'),
                elem.get('LinkTypeId'), elem.get('PostId'),
                elem.get('RelatedPostId')
            ))

        # If we've reached the chunk size, insert the chunk and reset
        if len(chunk_data) >= chunk_size:
            chunk_count += 1
            _flush(chunk_data)
            chunk_data = []

        # Drop processed rows so the parsed tree does not grow with the file.
        root.clear()

    # If there are any remaining elements in the final chunk, insert them
    if chunk_data:
        chunk_count += 1
        _flush(chunk_data)

    print(f"Processed {chunk_count} chunks of {chunk_size} elements each.")

# Load the PostLinks dump into the postlinks table.
for file in ("PostLinks",):
    input_file = f"askubuntu/{file}.xml"  # Path to the large XML dump
    post_links_db_chunks(input_file, post_links_query, max_chunks=1)


In [None]:
### INSERT INTO COMMENTS

In [None]:
#### INSERT INTO COMMENTS
# NOTE(review): create_schema.sql is executed again by every loader cell; presumably the
# script is idempotent and does not drop tables that are already loaded — confirm.
exec_sql_file('create_schema.sql')

def comments_db_chunks(input_file, query, max_chunks=1):
    """
    Stream the <row> elements of a Comments XML dump into the database in batches.

    Rows lacking an Id, PostId or UserId attribute are skipped. Before each
    batch is inserted, rows whose PostId is missing from posts or whose UserId
    is missing from users are dropped. Batches are flushed when they reach the
    module-level `chunk_size`.

    :param input_file: Path to the large XML file.
    :param query: INSERT ... VALUES %s statement used for every batch.
    :param max_chunks: Accepted for backward compatibility; the chunk cap is
        disabled, so this value is ignored.
    """

    def _flush(rows):
        """Insert `rows`, dropping those that reference an unknown post or user."""
        post_ids = extract_ids_from_chunk(rows, -5)
        valid_post_ids = check_valid_fk_ids('posts', post_ids)
        user_ids = extract_ids_from_chunk(rows, -1)
        valid_user_ids = check_valid_fk_ids('users', user_ids)

        # Compare found ids against the distinct requested ids on both sides.
        all_posts_known = len(valid_post_ids) == len(set(post_ids))
        all_users_known = len(valid_user_ids) == len(set(user_ids))
        if not (all_posts_known and all_users_known):
            # Some ids are invalid, filter out invalid entries
            rows = remove_invalid_entries_links(rows, valid_post_ids, -5)
            rows = remove_invalid_entries_links(rows, valid_user_ids, -1)
        insert_chunk_into_db(query, rows)

    context = ET.iterparse(input_file, events=("start", "end"))
    # The first event is the start of the document root; keep a handle on it so
    # processed rows can be detached from the tree.
    _, root = next(context)

    chunk_count = 0
    chunk_data = []  # Rows waiting to be inserted

    for event, elem in context:
        # Handle each row exactly once, when it has been completely parsed.
        if event != "end" or elem.tag != 'row':
            continue

        if elem.get('Id') is not None and elem.get('PostId') is not None and elem.get('UserId') is not None:
            chunk_data.append((
                elem.get('Id'), elem.get('ContentLicense'),
                elem.get('CreationDate'), elem.get('PostId'),
                elem.get('Score'), elem.get('Text'),
                elem.get('UserDisplayName'), elem.get('UserId')
            ))

        # If we've reached the chunk size, insert the chunk and reset
        if len(chunk_data) >= chunk_size:
            chunk_count += 1
            _flush(chunk_data)
            chunk_data = []

        # Drop processed rows so the parsed tree does not grow with the file.
        root.clear()

    # If there are any remaining elements in the final chunk, insert them
    if chunk_data:
        chunk_count += 1
        _flush(chunk_data)

    print(f"Processed {chunk_count} chunks of {chunk_size} elements each.")


# Load the Comments dump into the comments table.
for file in ("Comments",):
    input_file = f"askubuntu/{file}.xml"  # Path to the large XML dump
    comments_db_chunks(input_file, comments_query, max_chunks=1)
    

In [None]:
#### INSERT INTO POST HISTORY
# NOTE(review): create_schema.sql is executed again by every loader cell; presumably the
# script is idempotent and does not drop tables that are already loaded — confirm.
exec_sql_file('create_schema.sql')

def post_hist_db_chunks(input_file, query, max_chunks=1):
    """
    Stream the <row> elements of a PostHistory XML dump into the database in batches.

    Rows lacking an Id, PostId or UserId attribute are skipped. Before each
    batch is inserted, rows whose PostId is missing from posts or whose UserId
    is missing from users are dropped. Batches are flushed when they reach the
    module-level `chunk_size`.

    :param input_file: Path to the large XML file.
    :param query: INSERT ... VALUES %s statement used for every batch.
    :param max_chunks: Accepted for backward compatibility; the chunk cap is
        disabled, so this value is ignored.
    """

    def _flush(rows):
        """Insert `rows`, dropping those that reference an unknown post or user."""
        post_ids = extract_ids_from_chunk(rows, -5)
        valid_post_ids = check_valid_fk_ids('posts', post_ids)
        user_ids = extract_ids_from_chunk(rows, -1)
        valid_user_ids = check_valid_fk_ids('users', user_ids)

        # Compare found ids against the distinct requested ids on both sides.
        all_posts_known = len(valid_post_ids) == len(set(post_ids))
        all_users_known = len(valid_user_ids) == len(set(user_ids))
        if not (all_posts_known and all_users_known):
            # Some ids are invalid, filter out invalid entries
            rows = remove_invalid_entries_links(rows, valid_post_ids, -5)
            rows = remove_invalid_entries_links(rows, valid_user_ids, -1)
        insert_chunk_into_db(query, rows)

    context = ET.iterparse(input_file, events=("start", "end"))
    # The first event is the start of the document root; keep a handle on it so
    # processed rows can be detached from the tree.
    _, root = next(context)

    chunk_count = 0
    chunk_data = []  # Rows waiting to be inserted

    for event, elem in context:
        # Handle each row exactly once, when it has been completely parsed.
        if event != "end" or elem.tag != 'row':
            continue

        if elem.get('Id') is not None and elem.get('PostId') is not None and elem.get('UserId') is not None:
            chunk_data.append((
                elem.get('Id'), elem.get('Comment'),
                elem.get('ContentLicense'), elem.get('CreationDate'),
                elem.get('PostHistoryTypeId'),
                elem.get('PostId'), elem.get('RevisionGUID'),
                elem.get('Text'), elem.get('UserDisplayName'),
                elem.get('UserId')
            ))

        # If we've reached the chunk size, insert the chunk and reset
        if len(chunk_data) >= chunk_size:
            chunk_count += 1
            _flush(chunk_data)
            chunk_data = []

        # Drop processed rows so the parsed tree does not grow with the file.
        root.clear()

    # If there are any remaining elements in the final chunk, insert them
    if chunk_data:
        chunk_count += 1
        _flush(chunk_data)

    print(f"Processed {chunk_count} chunks of {chunk_size} elements each.")

# Load the PostHistory dump into the posthistory table.
for file in ("PostHistory",):
    input_file = f"askubuntu/{file}.xml"  # Path to the large XML dump
    post_hist_db_chunks(input_file, post_history_query, max_chunks=1)
    
    


In [None]:
#### INSERT INTO VOTES
# NOTE(review): create_schema.sql is executed again by every loader cell; presumably the
# script is idempotent and does not drop tables that are already loaded — confirm.
exec_sql_file('create_schema.sql')

def _insert_votes_chunk(query, chunk_data):
    """
    Insert one batch of vote tuples after checking their PostId references.

    :param query: Insert statement forwarded unchanged to ``insert_chunk_into_db``.
    :param chunk_data: List of ``(Id, BountyAmount, CreationDate, PostId, UserId, VoteTypeId)`` tuples.
    """
    # PostId is the third field from the end of every vote tuple.
    post_ids = extract_ids_from_chunk(chunk_data, -3)
    # Presumably returns the ids that exist in the `posts` table — verify against the helper.
    valid_post_ids = check_valid_fk_ids('posts', post_ids)

    if len(valid_post_ids) == len(set(post_ids)):
        # Every distinct PostId was confirmed: bulk-insert the batch untouched.
        insert_chunk_into_db(query, chunk_data)
    else:
        # Some PostIds were not confirmed: insert only what the filter helper keeps.
        valid_chunk_data = remove_invalid_entries_links(chunk_data, valid_post_ids, -3)
        insert_chunk_into_db(query, valid_chunk_data)


def votes_db_chunks(input_file, query, max_chunks=1, chunk_size=None):
    """
    Stream ``<row>`` elements from a Votes XML dump and insert them in batches.

    Rows without an ``Id`` or ``PostId`` attribute are skipped.

    :param input_file: Path to the large XML file.
    :param query: Insert statement passed through to ``insert_chunk_into_db``.
    :param max_chunks: Currently ignored (the early-stop check is disabled); kept so
        existing calls keep working.
    :param chunk_size: Number of rows per insert batch. When omitted, the module-level
        ``chunk_size`` variable is used, which is how this function was always driven.
    :raises NameError: if ``chunk_size`` is omitted and no module-level ``chunk_size`` exists.
    :raises ValueError: if the resolved chunk size is smaller than 1.
    """
    if chunk_size is None:
        # Backward compatibility: existing cells assign a global ``chunk_size``
        # right before calling this function instead of passing it in.
        try:
            chunk_size = globals()['chunk_size']
        except KeyError:
            raise NameError("name 'chunk_size' is not defined") from None
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size!r}")

    context = ET.iterparse(input_file, events=("start", "end"))

    # The first event is the start of the root element. Keep a handle on it so
    # processed rows can be detached; otherwise the tree keeps one (emptied)
    # child per row for the whole file.
    _, root = next(context)

    chunk_count = 0
    chunk_data = []  # Tuples waiting to be inserted into the database

    for event, elem in context:
        # Handle each row exactly once, when its element is complete.
        if event != "end" or elem.tag != 'row':
            continue

        # Collect data from element attributes; optional attributes come back as None.
        if elem.get('Id') is not None and elem.get('PostId') is not None:
            chunk_data.append((
                elem.get('Id'), elem.get('BountyAmount'),
                elem.get('CreationDate'), elem.get('PostId'),
                elem.get('UserId'), elem.get('VoteTypeId'),
            ))

        # Once a full batch is collected, write it out and start a new one.
        if len(chunk_data) >= chunk_size:
            chunk_count += 1
            _insert_votes_chunk(query, chunk_data)
            chunk_data = []

        # Detach everything parsed so far from the root to keep memory flat.
        # Rows that are already parsed but not yet yielded stay alive through
        # the pending event queue, so no data is lost.
        root.clear()

    # Write out whatever is left in the final, partially filled batch.
    if chunk_data:
        chunk_count += 1
        _insert_votes_chunk(query, chunk_data)

    print(f"Processed {chunk_count} chunks of {chunk_size} elements each.")

# Feed the Votes dump through the chunked loader.
for file in ("Votes",):
    # The loader picks up this module-level batch size.
    chunk_size = 5
    # Dumps are stored as askubuntu/<Name>.xml
    input_file = f"askubuntu/{file}.xml"
    votes_db_chunks(input_file, votes_query, max_chunks=1)


In [23]:
#### INSERT INTO TAGS
# Execute the schema script before the Tags dump is loaded below.
# NOTE(review): presumably create_schema.sql is safe to re-run and does not drop
# already-loaded tables — confirm, because the tags loader below checks post ids
# via check_valid_fk_ids('posts', ...).
exec_sql_file('create_schema.sql')

def _insert_tags_chunk(query, chunk_data):
    """
    Insert one batch of tag tuples after checking both post references.

    :param query: Insert statement forwarded unchanged to ``insert_chunk_into_db``.
    :param chunk_data: List of ``(Id, Count, ExcerptPostId, TagName, WikiPostId)`` tuples.
    """
    # ExcerptPostId is third from the end, WikiPostId is the last field.
    post_ids = extract_ids_from_chunk(chunk_data, -3)
    # Presumably returns the ids that exist in the `posts` table — verify against the helper.
    valid_post_ids = check_valid_fk_ids('posts', post_ids)
    wiki_ids = extract_ids_from_chunk(chunk_data, -1)
    valid_wiki_ids = check_valid_fk_ids('posts', wiki_ids)

    # Compare confirmed ids against the *distinct* requested ids for both
    # columns, so repeated ids inside a batch do not force the slow path.
    all_posts_valid = len(valid_post_ids) == len(set(post_ids))
    all_wikis_valid = len(valid_wiki_ids) == len(set(wiki_ids))

    if all_posts_valid and all_wikis_valid:
        # Every referenced post was confirmed: bulk-insert the batch untouched.
        insert_chunk_into_db(query, chunk_data)
    else:
        # Some references were not confirmed: filter on each column in turn.
        valid_chunk_data = remove_invalid_entries_links(chunk_data, valid_post_ids, -3)
        valid_chunk_data = remove_invalid_entries_links(valid_chunk_data, valid_wiki_ids, -1)
        insert_chunk_into_db(query, valid_chunk_data)


def tags_db_chunks(input_file, query, max_chunks=1, chunk_size=None):
    """
    Stream ``<row>`` elements from a Tags XML dump and insert them in batches.

    Rows are skipped unless they carry ``Id`` and ``ExcerptPostId`` attributes and a
    non-empty ``WikiPostId`` attribute.

    :param input_file: Path to the large XML file.
    :param query: Insert statement passed through to ``insert_chunk_into_db``.
    :param max_chunks: Currently ignored (the early-stop check is disabled); kept so
        existing calls keep working.
    :param chunk_size: Number of rows per insert batch. When omitted, the module-level
        ``chunk_size`` variable is used, which is how this function was always driven.
    :raises NameError: if ``chunk_size`` is omitted and no module-level ``chunk_size`` exists.
    :raises ValueError: if the resolved chunk size is smaller than 1.
    """
    if chunk_size is None:
        # Backward compatibility: existing cells assign a global ``chunk_size``
        # right before calling this function instead of passing it in.
        try:
            chunk_size = globals()['chunk_size']
        except KeyError:
            raise NameError("name 'chunk_size' is not defined") from None
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size!r}")

    context = ET.iterparse(input_file, events=("start", "end"))

    # The first event is the start of the root element. Keep a handle on it so
    # processed rows can be detached; otherwise the tree keeps one (emptied)
    # child per row for the whole file.
    _, root = next(context)

    chunk_count = 0
    chunk_data = []  # Tuples waiting to be inserted into the database

    for event, elem in context:
        # Handle each row exactly once, when its element is complete.
        if event != "end" or elem.tag != 'row':
            continue

        # Collect data from element attributes; Count/TagName may come back as None.
        if (
            elem.get('Id') is not None
            and elem.get('ExcerptPostId') is not None
            and elem.get('WikiPostId')
        ):
            chunk_data.append((
                elem.get('Id'), elem.get('Count'),
                elem.get('ExcerptPostId'),
                elem.get('TagName'),
                elem.get('WikiPostId'),
            ))

        # Once a full batch is collected, write it out and start a new one.
        if len(chunk_data) >= chunk_size:
            chunk_count += 1
            _insert_tags_chunk(query, chunk_data)
            chunk_data = []

        # Detach everything parsed so far from the root to keep memory flat.
        # Rows that are already parsed but not yet yielded stay alive through
        # the pending event queue, so no data is lost.
        root.clear()

    # Write out whatever is left in the final, partially filled batch.
    if chunk_data:
        chunk_count += 1
        _insert_tags_chunk(query, chunk_data)

    print(f"Processed {chunk_count} chunks of {chunk_size} elements each.")

# Feed the Tags dump through the chunked loader.
for file in ("Tags",):
    # The loader picks up this module-level batch size.
    chunk_size = 5000
    # Dumps are stored as askubuntu/<Name>.xml
    input_file = f"askubuntu/{file}.xml"
    tags_db_chunks(input_file, tags_query, max_chunks=1)
    

Inserted 2443 records into the database.
Processed 1 chunks of 5000 elements each.


In [1]:
import numpy as np
import time

# Micro-benchmark: how long does it take to find the items of one large list
# that are missing from another, using a set for membership tests?

# Tunables for the experiment
SEED = 42                      # fixed seed so the printed count is reproducible
ARRAY1_SIZE = 1_000_000        # number of items in the lookup list
ARRAY2_SIZE = 500_000          # number of items to test for membership
VALUE_UPPER_BOUND = 1_000_000  # values are drawn from [0, VALUE_UPPER_BOUND)

# Generate example arrays from a seeded generator
rng = np.random.default_rng(SEED)
array1 = rng.integers(0, VALUE_UPPER_BOUND, size=ARRAY1_SIZE).tolist()
array2 = rng.integers(0, VALUE_UPPER_BOUND, size=ARRAY2_SIZE).tolist()

# Start timing; perf_counter is the clock intended for measuring short intervals
start_time = time.perf_counter()

# Convert array1 to a set for faster lookup (deliberately included in the timing)
set1 = set(array1)

# Find items in array2 that are not in array1
result = [item for item in array2 if item not in set1]

# End timing
end_time = time.perf_counter()

# Calculate elapsed time
elapsed_time = end_time - start_time

# With these sizes roughly ARRAY2_SIZE / e (about 184k) items are expected to be missing.
print(f"Items not in array1: {len(result)}")
print(f"Time taken: {elapsed_time:.6f} seconds")


Items not in array1: 183978
Time taken: 0.218802 seconds


: 