In [1]:
import pandas as pd
import os
from tqdm.notebook import tqdm
from collections import defaultdict

pd.set_option("display.width", 500)
pd.set_option("display.max_columns", 100)

In [None]:
# Define the directory containing your CSV files
csv_directory = "../data/"

# Define the output directory for user action files
output_directory = "../data_users/"
os.makedirs(output_directory, exist_ok=True)

# Load user IDs from Parquet files
super_users_df = pd.read_parquet("../data_users/SuperUserIds.parquet")
# non_super_users_df = pd.read_parquet("../data_users/NonSuperUserIds.parquet")

# Convert DataFrames to sets for faster lookup
super_user_ids = set(super_users_df["Id"].astype(int))
# non_super_user_ids = set(non_super_users_df["Id"].astype(int))

# Mapping of CSV files to user ID columns and action types
file_mappings = {
    "Votes.csv": {
        "user_id_column": "UserId",
        "action_type_column": "VoteTypeId",
        "action_type_map": {2: "UpVote", 3: "DownVote"},
        "date_column": "CreationDate",
    },
    "Posts.csv": {
        "user_id_column": "OwnerUserId",
        "action_type_column": "PostTypeId",
        "action_type_map": {1: "Question", 2: "Answer"},
        "date_column": "CreationDate",
    },
    "Comments.csv": {
        "user_id_column": "UserId",
        "action_type": "Comment",
        "date_column": "CreationDate",
    },
    "Badges.csv": {
        "user_id_column": "UserId",
        "action_type": "Badge",
        "date_column": "Date",
    },
}

# Define the columns to extract for each file
columns_mapping = {
    "Posts.csv": ["OwnerUserId", "CreationDate", "PostTypeId"],
    "Comments.csv": ["UserId", "CreationDate"],
    "Votes.csv": ["UserId", "CreationDate", "VoteTypeId"],
    "Badges.csv": ["UserId", "Date"],
}

In [11]:
# testing - reading the first 25% of Vots.csv and printing valuecounts of VoteTypeId
votes_df = pd.read_csv("../data/Votes.csv", nrows=1000000, delimiter="\x17")
print(votes_df.head())

   BountyAmount             CreationDate        Id  PostId  UserId  VoteTypeId
0           NaN  2011-05-12T00:00:00.000  14967360    8440     NaN           2
1           NaN  2011-05-12T00:00:00.000  14969494    8440     NaN           2
2           NaN  2011-05-14T00:00:00.000  15032293    8440     NaN           2
3           NaN  2011-05-16T00:00:00.000  15062845    8440     NaN           2
4           NaN  2011-05-25T00:00:00.000  15350663    8440     NaN           2


In [5]:
def estimate_total_chunks(file_path, chunksize, sample_size_bytes=1024*1024):
    total_bytes = os.path.getsize(file_path)
    with open(file_path, 'rb') as f:
        sample_bytes = f.read(sample_size_bytes)
    sample_lines = sample_bytes.count(b'\n') or 1  # Avoid division by zero
    avg_bytes_per_line = sample_size_bytes / sample_lines
    estimated_total_lines = total_bytes / avg_bytes_per_line
    total_chunks = estimated_total_lines / chunksize
    return int(total_chunks)

def process_file(file_name, mapping, columns, user_ids, output_file, chunksize=10**6):
    file_path = os.path.join(csv_directory, file_name)
    print(f"Processing {file_name}...")
    date_column = mapping.get('date_column', 'CreationDate')

    # Estimate total chunks
    total_chunks = estimate_total_chunks(file_path, chunksize)
    print(f"Estimated total chunks: {total_chunks}")

    # Read the CSV in chunks
    try:
        csv_iterator = pd.read_csv(
            file_path,
            chunksize=chunksize,
            usecols=columns,
            parse_dates=[date_column],
            iterator=True,
            delimiter='\x17',
        )
    except FileNotFoundError:
        print(f"File {file_name} not found in {csv_directory}. Skipping.")
        return
    except Exception as e:
        print(f"Error reading {file_name}: {e}")
        return

    action_type_column = mapping.get('action_type_column')
    action_type_map = mapping.get('action_type_map', {})
    default_action_type = mapping.get('action_type', 'Action')

    count_of_skips = 0

    print(f"Filtering actions for {len(user_ids)} target users...")
    for chunk in tqdm(csv_iterator, total=int(total_chunks)):
        # Drop rows with missing user IDs, dates, or action type columns
        dropna_columns = [mapping['user_id_column'], date_column]
        if action_type_column:
            dropna_columns.append(action_type_column)
        chunk.dropna(subset=dropna_columns, inplace=True)

        # Ensure data types are consistent
        chunk[mapping['user_id_column']] = chunk[mapping['user_id_column']].astype(int)
        if action_type_column:
            chunk[action_type_column] = chunk[action_type_column].astype(int)

        # Filter rows where the user ID is in target_user_ids
        filtered_chunk = chunk[chunk[mapping['user_id_column']].isin(user_ids)]

        if filtered_chunk.empty:
            continue  # Skip if no relevant actions in this chunk

        # Prepare action records
        records = []
        for _, row in filtered_chunk.iterrows():
            print(row)
            user_id = row[mapping['user_id_column']]
            creation_date = row[date_column].isoformat()

            if action_type_column:
                action_type_id = row[action_type_column]
                if action_type_id in action_type_map:
                    action_type = action_type_map[action_type_id]
                else:
                    # print(f"Unknown action type ID: {action_type_id} in {file_name}. Skipping.")
                    count_of_skips += 1
                    continue
            else:
                action_type = default_action_type

            records.append(f"{user_id},{action_type},{creation_date}\n")

        # Append records to the consolidated output file
        with open(output_file, 'a', encoding='utf-8') as f:
            f.writelines(records)

    print(f"Finished processing {file_name}.")
    print(f"Skipped {count_of_skips} rows with unknown action types.\n")

In [6]:
non_super_file = "NonSuperUserIdsSample_0.01.parquet"
non_super_sample = pd.read_parquet(f"../data_users/{non_super_file}")
non_super_user_ids = set(non_super_sample["Id"].astype(int))

non_super_output_path = os.path.join(output_directory, "actions_" + non_super_file.replace(".parquet", ".csv"))

print(non_super_output_path)

if os.path.exists(non_super_output_path):
    os.remove(non_super_output_path)
    with open(non_super_output_path, 'w', encoding='utf-8') as f:
        f.write("UserId,ActionType,CreationDate\n")

# Process files for Non-Super Users
for file_name, mapping in file_mappings.items():
    columns = columns_mapping.get(file_name, [mapping['user_id_column'], 'CreationDate'])
    process_file(
        file_name=file_name,
        mapping=mapping,
        columns=columns,
        user_ids=non_super_user_ids,
        output_file=non_super_output_path
    )

../data_users/actions_NonSuperUserIdsSample_0.01.csv
Processing Votes.csv...
Estimated total chunks: 267
Filtering actions for 220533 target users...


  0%|          | 0/267 [00:00<?, ?it/s]

CreationDate    2012-11-01 00:00:00
UserId                       149316
VoteTypeId                        8
Name: 2520735, dtype: object
CreationDate    2009-02-22 00:00:00
UserId                        30480
VoteTypeId                        8
Name: 4093046, dtype: object
CreationDate    2009-08-12 00:00:00
UserId                        44482
VoteTypeId                        8
Name: 6800024, dtype: object
CreationDate    2009-02-03 00:00:00
UserId                           81
VoteTypeId                        8
Name: 7477107, dtype: object
CreationDate    2009-02-24 00:00:00
UserId                        53491
VoteTypeId                        8
Name: 8701296, dtype: object
CreationDate    2018-07-31 00:00:00
UserId                       409102
VoteTypeId                        8
Name: 8837753, dtype: object
CreationDate    2011-01-20 00:00:00
UserId                       209824
VoteTypeId                        8
Name: 9010799, dtype: object
CreationDate    2009-02-16 00:00:00
UserI

KeyboardInterrupt: 

In [None]:
# # Define output file paths
# super_output_path = os.path.join(output_directory, "super_users_actions.csv")

# # remove the output paths if they already exist
# if os.path.exists(super_output_path):
#     os.remove(super_output_path)

# # Process files for Super Users
# for file_name, mapping in file_mappings.items():
#     columns = columns_mapping.get(file_name, [mapping['user_id_column'], 'CreationDate'])
#     process_file(
#         file_name=file_name,
#         mapping=mapping,
#         columns=columns,
#         user_ids=super_user_ids,
#         output_file=super_output_path
#     )

super_user_file = "SuperUserIds.parquet"
super_user_sample = pd.read_parquet(f"../data_users/{super_user_file}")
super_user_ids = set(super_user_sample["Id"].astype(int))
super_output_path = os.path.join(output_directory, "actions_" + super_user_file.replace(".parquet", ".csv"))

# remove the output paths if they already exist
if os.path.exists(super_output_path):
    os.remove(super_output_path)

# Process files for Super Users
for file_name, mapping in file_mappings.items():
    columns = columns_mapping.get(file_name, [mapping['user_id_column'], 'CreationDate'])
    process_file(
        file_name=file_name,
        mapping=mapping,
        columns=columns,
        user_ids=super_user_ids,
        output_file=super_output_path
   )
    
print("Done!")

Processing Votes.csv...
Estimated total chunks: 267
Filtering actions for 22076 target users...


  0%|          | 0/267 [00:00<?, ?it/s]

Finished processing Votes.csv.
Skipped 63554 rows with unknown action types.

Processing Posts.csv...
Estimated total chunks: 1015
Filtering actions for 22076 target users...


  0%|          | 0/1015 [00:00<?, ?it/s]

Finished processing Posts.csv.
Skipped 47408 rows with unknown action types.

Processing Comments.csv...
Estimated total chunks: 95
Filtering actions for 22076 target users...


  0%|          | 0/95 [00:00<?, ?it/s]

Finished processing Comments.csv.
Skipped 0 rows with unknown action types.

Processing Badges.csv...
Estimated total chunks: 56
Filtering actions for 22076 target users...


  0%|          | 0/56 [00:00<?, ?it/s]

Finished processing Badges.csv.
Skipped 0 rows with unknown action types.

Done!
