# Pseudonymized YouTube data extraction

In [1]:
import pandas as pd
import hashlib
import json
import os
import secrets

In [2]:
# CONSTANTS
# Directory that receives the JSON file linking each hash back to its original value.
MAPPING_DIR = "../data/mappings"
# Salt appended to every value before hashing. A new random one (16 random bytes,
# hex-encoded) is drawn each time this cell runs; replace it to rehash the IDs.
SALT = secrets.token_hex(nbytes=16)

In [3]:
# Make sure the mapping directory is present before anything is written into it
os.makedirs(name=MAPPING_DIR, exist_ok=True)

In [4]:
def hash_with_salt(value, salt):
    """Return the SHA-256 hex digest of ``str(value) + salt``.

    Null values (as detected by ``pd.isnull``) are not hashed; an empty
    string is returned for them instead.
    """
    if pd.isnull(value):
        return ''

    salted_bytes = (str(value) + salt).encode('utf-8')
    digest = hashlib.sha256(salted_bytes)
    return digest.hexdigest()

In [5]:
def anonymize_dataframe_columns(csv_path, columns, col_only_remove, salt, mapping_path=None):
    """Replace ID columns of a CSV file with salted hashes and save the reverse mapping.

    Every column listed in ``columns`` is replaced by the output of
    ``hash_with_salt`` (null entries become ``''``). The columns listed in
    ``col_only_remove`` are dropped without being hashed. A JSON file holding
    the salt and a ``{column: {hash: original}}`` lookup is written so the
    replacement can be reversed later.

    Args:
        csv_path: Path of the CSV file to read.
        columns: Names of the columns whose values are replaced by hashes.
        col_only_remove: Names of the columns that are dropped outright.
        salt: String appended to each value before hashing.
        mapping_path: Destination of the JSON mapping file. Defaults to
            ``mapping.json`` inside ``MAPPING_DIR``.

    Returns:
        The DataFrame with hashed columns, without the removed columns and
        without an ``"Unnamed: 0"`` column if one was present.

    Raises:
        KeyError: If a name in ``columns`` or ``col_only_remove`` is not a
            column of the CSV file.
    """
    if mapping_path is None:
        mapping_path = os.path.join(MAPPING_DIR, 'mapping.json')

    df = pd.read_csv(csv_path)

    mapping_dict = {
        "salt": salt,
        "mappings": {}
    }

    # Hash every column from the untouched originals first, and only swap the
    # columns in afterwards, so a column is never hashed twice.
    hashed_columns = {}
    for col in columns:
        originals = df[col]
        hashed = originals.apply(lambda value: hash_with_salt(value, salt))

        # Null originals have no hash worth restoring, so they are left out.
        mapping_dict["mappings"][col] = {
            anon: original
            for original, anon in zip(originals, hashed)
            if not pd.isnull(original)
        }
        hashed_columns[col] = hashed

    # Replace original columns with anonymized ones (column positions are kept)
    for col, hashed in hashed_columns.items():
        df[col] = hashed

    # Remove only specified columns
    df = df.drop(columns=col_only_remove)

    # Remove the "Unnamed: 0" column (typically a stray index written by to_csv)
    df = df.drop(columns=["Unnamed: 0"], errors="ignore")

    # Save dict to JSON file. The encoding is pinned to UTF-8 because
    # ensure_ascii=False emits raw non-ASCII characters and the mapping is
    # read back as UTF-8; the platform default encoding may differ.
    mapping_parent = os.path.dirname(mapping_path)
    if mapping_parent:
        os.makedirs(mapping_parent, exist_ok=True)
    with open(mapping_path, 'w', encoding='utf-8') as f:
        json.dump(mapping_dict, f, ensure_ascii=False, indent=4)
    print(f"Mapping saved to: {mapping_path}")

    return df

def save_anonymized_csv(df: pd.DataFrame, output_path):
    """Write ``df`` to ``output_path`` as CSV, omitting the index, and report where it went."""
    df.to_csv(output_path, index=False)
    confirmation = f"Anonymized CSV saved to: {output_path}"
    print(confirmation)

In [6]:
def deanonymize_dataframe_columns(csv_path, columns, mapping_path, output_path):
    """Restore original values in hashed columns of a CSV file using a saved mapping.

    Args:
        csv_path: Path of the CSV file containing hashed columns.
        columns: Names of the columns to restore. A column without an entry
            under ``"mappings"`` in the mapping file is skipped with a warning.
        mapping_path: Path of the JSON file with a ``"mappings"`` object of
            the form ``{column: {hash: original}}``.
        output_path: Path the restored CSV file is written to.

    Returns:
        The DataFrame with the requested columns mapped back. Cells whose
        value has no entry in the mapping become NaN.

    Raises:
        KeyError: If the mapping file has no ``"mappings"`` key, or a column
            that has a mapping is not present in the CSV file.
    """
    df = pd.read_csv(csv_path)

    with open(mapping_path, 'r', encoding="utf-8") as f:
        mapping_dict = json.load(f)

    mappings = mapping_dict["mappings"]

    for col in columns:
        if col not in mappings:
            print(f"Warning: No mapping found for column '{col}'. Skipping.")
            continue

        hashed = df[col]
        restored = hashed.map(mappings[col])

        # Series.map yields NaN for every value missing from the dict. Cells
        # that were already null are expected; anything else means the mapping
        # does not match this file, so report it instead of losing it silently.
        unmatched = int((hashed.notna() & restored.isna()).sum())
        if unmatched:
            print(f"Warning: {unmatched} value(s) in column '{col}' have no mapping entry and were set to NaN.")

        df[col] = restored

    # Save the restored DataFrame to a new CSV file
    df.to_csv(output_path, index=False)
    print(f"Deanonymized CSV saved to: {output_path}")

    return df


In [7]:
# Hash the ID columns, drop the free-text/name columns, then write the result to disk
anonymized_data = anonymize_dataframe_columns(
    csv_path='../data/comments_with_sentiment_class.csv',
    columns=['comment_id', 'author_id', 'reply_to_comment_id', 'video_id'],
    col_only_remove=['author_name', 'text'],
    salt=SALT,
)
save_anonymized_csv(
    df=anonymized_data,
    output_path='../data/comments_with_sentiment_class_anonymized.csv',
)

Mapping saved to: ../data/mappings/mapping.json
Anonymized CSV saved to: ../data/comments_with_sentiment_class_anonymized.csv


In [8]:
# Map the hashed ID columns back to their original values using the saved mapping
deanomymized_data = deanonymize_dataframe_columns(
    csv_path='../data/comments_with_sentiment_class_anonymized.csv',
    columns=['comment_id', 'author_id', 'reply_to_comment_id', 'video_id'],
    mapping_path=MAPPING_DIR + '/mapping.json',
    output_path='../data/comments_with_sentiment_class_deanonymized.csv',
)

Deanonymized CSV saved to: ../data/comments_with_sentiment_class_deanonymized.csv
