# Data Merging
This notebook merges `empath_features.csv`, `lsa_features.csv`, and `processed_comments.csv` into a single pickle file.

In [1]:
import pandas as pd
import os

# Define file paths
# All three inputs live in the same directory; the merged pickles are written
# there as well by later cells.
data_dir = 'data-ai-slop-detector'
empath_path = os.path.join(data_dir, 'empath_features.csv')
lsa_path = os.path.join(data_dir, 'lsa_features.csv')
comments_path = os.path.join(data_dir, 'processed_comments.csv')

# Load the datasets
print("Loading datasets...")
try:
    df_empath = pd.read_csv(empath_path)
    df_lsa = pd.read_csv(lsa_path)
    df_comments = pd.read_csv(comments_path)

    print("Datasets loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading files: {e}")
    # Re-raise: everything below reads all three frames, so continuing would
    # only replace this clear error with a NameError on an undefined frame.
    raise

# Inspect the dataframes
print(f"\nEmpath Features Shape: {df_empath.shape}")
print(f"LSA Features Shape: {df_lsa.shape}")
print(f"Processed Comments Shape: {df_comments.shape}")

print("\n--- Empath Features Columns (First 5) ---")
print(df_empath.columns.tolist()[:5])

print("\n--- LSA Features Columns (First 5) ---")
print(df_lsa.columns.tolist()[:5])

print("\n--- Processed Comments Columns (First 5) ---")
print(df_comments.columns.tolist()[:5])

# Check for length mismatch
# The feature files carry no join key, so equal row counts are the only
# alignment evidence available here; the merge cells rely on positional order.
if len(df_empath) == len(df_lsa) == len(df_comments):
    print("\nAll dataframes have the same number of rows. Ready to merge.")
else:
    print("\nWARNING: Row counts do not match!")
    print("Merging by index might be incorrect if rows are not aligned.")

Loading datasets...
Datasets loaded successfully.

Empath Features Shape: (139407, 194)
LSA Features Shape: (139407, 62)
Processed Comments Shape: (139407, 15)

--- Empath Features Columns (First 5) ---
['help', 'office', 'dance', 'money', 'wedding']

--- LSA Features Columns (First 5) ---
['lsa_dim_0', 'lsa_dim_1', 'lsa_dim_2', 'lsa_dim_3', 'lsa_dim_4']

--- Processed Comments Columns (First 5) ---
['commenter_id', 'comment_id', 'parent_id', 'post_id', 'comment_content']

All dataframes have the same number of rows. Ready to merge.
Datasets loaded successfully.

Empath Features Shape: (139407, 194)
LSA Features Shape: (139407, 62)
Processed Comments Shape: (139407, 15)

--- Empath Features Columns (First 5) ---
['help', 'office', 'dance', 'money', 'wedding']

--- LSA Features Columns (First 5) ---
['lsa_dim_0', 'lsa_dim_1', 'lsa_dim_2', 'lsa_dim_3', 'lsa_dim_4']

--- Processed Comments Columns (First 5) ---
['commenter_id', 'comment_id', 'parent_id', 'post_id', 'comment_content']

All

In [2]:
# Column-wise merge of the three frames, followed by a pickle dump.
# The frames share no key column, so they are glued together positionally;
# equal row counts are the precondition for doing that at all.
frames_in_merge_order = [df_comments, df_empath, df_lsa]
row_counts = {len(frame) for frame in frames_in_merge_order}

if len(row_counts) != 1:
    print("Skipping merge due to row count mismatch. Please investigate the data alignment.")
else:
    print("Merging dataframes...")
    df_merged = pd.concat(frames_in_merge_order, axis=1)

    # Persist the merged frame next to the input CSVs.
    output_path = os.path.join(data_dir, 'merged_data.pkl')
    print(f"Saving merged data to {output_path}...")
    df_merged.to_pickle(output_path)
    print("Done.")

Merging dataframes...
Saving merged data to data-ai-slop-detector/merged_data.pkl...
Done.
Done.


In [3]:
# Round-trip check: reload the pickle written by the merge cell.
# `output_path` is only assigned when that cell actually performed the merge,
# so this raises NameError if the merge was skipped.
# NOTE(review): read_pickle executes pickled code; only load files this notebook produced.
df = pd.read_pickle(output_path)

In [4]:
# Show the reloaded merged frame as this cell's output.
df

Unnamed: 0,commenter_id,comment_id,parent_id,post_id,comment_content,cleaned_content,num_emojis,num_text_emojis,num_caps_words,num_unicode_chars,...,lsa_dim_52,lsa_dim_53,lsa_dim_54,lsa_dim_55,lsa_dim_56,lsa_dim_57,lsa_dim_58,lsa_dim_59,lsa_dim_60,lsa_dim_61
0,AdamParkhomenko,1,,1,https://t.co/rAkU7CWOVE,link,0,0,0,0,...,0.000013,-0.000126,-0.000344,-0.000402,-0.000916,0.000051,0.000233,0.000026,0.000140,0.000635
1,SusanSaoirse,2,1.0,1,"@AdamParkhomenko Thing is, a good number of ma...",thing good number maga use medicaid oregon med...,0,0,0,0,...,0.032935,0.042518,-0.009026,0.020254,-0.004191,0.003561,0.007515,-0.014450,-0.000272,-0.019075
2,RealStarTrump,3,1.0,1,@AdamParkhomenko You’re a lying bastard. https...,youre lying bastard link,0,0,0,1,...,-0.006881,0.000761,0.006115,0.010444,-0.002105,-0.013546,0.006378,0.002457,-0.002760,0.000702
3,catothewis13876,4,1.0,1,@AdamParkhomenko The false premise is that ill...,false premise illegals get less than american ...,0,0,0,0,...,0.019162,0.061776,-0.055354,-0.008987,0.013624,-0.005087,-0.059042,-0.005017,-0.034343,-0.005884
4,masterson11776,5,1.0,1,@AdamParkhomenko Emergency rooms in So Cal are...,emergency rooms indiana so cal full illegal br...,0,0,0,1,...,0.014652,-0.019525,-0.003320,-0.004874,0.007576,-0.008425,-0.007787,-0.004375,-0.002130,-0.005013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139402,ColborneGreg,139403,,44,@nvidia @ylecun @soumithchintala @Meta @nyuniv...,i just received push notification nvidia adver...,0,0,2,0,...,0.000021,-0.019260,-0.032044,0.004067,-0.016273,-0.022032,-0.014603,-0.027949,0.053836,-0.015425
139403,trust_Mina,139404,,44,@nvidia @ylecun @soumithchintala @Meta @nyuniv...,wow seeing indiana nyc makes maine feel so luc...,1,0,3,2,...,-0.085519,-0.004203,-0.065151,-0.042522,0.104675,0.088470,0.076652,0.033047,0.095929,-0.017154
139404,damnThoughtful,139405,,44,@nvidia @ylecun @soumithchintala @Meta @nyuniv...,useless gimped bandwidth buy gpu instead,0,0,1,0,...,-0.001759,-0.003599,0.003692,-0.000820,-0.004070,0.000330,0.001171,0.001398,0.000775,0.000464
139405,honasu,139406,,44,@nvidia @ylecun @soumithchintala @Meta @nyuniv...,nvidia marketing play horrible part positionin...,0,0,2,0,...,-0.033805,-0.065463,-0.013342,0.032919,0.047874,0.043419,-0.010128,0.005432,-0.000098,0.020415


# Twitter NLP

In [5]:
# Ensure ipywidgets is available for notebook progress bars; install if missing.
try:
    import ipywidgets  # noqa
except Exception:
    %pip install ipywidgets jupyterlab_widgets

import tweetnlp
import numpy as np
# Prefer notebook widgets, but fall back to auto version if widgets are unavailable.
try:
    from tqdm.notebook import tqdm
except Exception:
    from tqdm.auto import tqdm
tqdm.pandas()

# Ensure models are loaded
print("Loading TweetNLP models...")
# Load models once to avoid reloading for every comment
try:
    model_sentiment = tweetnlp.load_model('sentiment')
    model_irony = tweetnlp.load_model('irony')
    model_hate = tweetnlp.load_model('hate')
    model_offensive = tweetnlp.load_model('offensive')
    print("Models loaded.")
except Exception as e:
    print(f"Error loading models: {e}")
    # Re-raise instead of continuing: analyze_comment() falls back to all-None
    # features on any exception, including the NameError a missing model would
    # cause, so a failed load would otherwise yield a full pass of empty
    # features that then gets saved to disk.
    raise

def _top_label_and_prob(prediction):
    """Return ``(label, probability of that label)`` from one classifier result.

    ``prediction`` is the dict returned by a TweetNLP classifier called with
    ``return_probability=True``: ``'label'`` holds the predicted class and
    ``'probability'`` is a dict like ``{'positive': 0.9, ...}`` keyed by class.
    """
    label = prediction['label']
    return label, prediction['probability'][label]


def analyze_comment(text):
    """Run the four TweetNLP classifiers on one comment.

    Parameters
    ----------
    text : object
        The raw comment. Anything that is not a ``str`` (e.g. NaN from a
        missing CSV cell) is not sent to the models.

    Returns
    -------
    pandas.Series
        Eight positional values: sentiment label, sentiment probability,
        irony label, irony probability, hate label, hate probability,
        offensive label, offensive probability. Each probability is the score
        of the *predicted* label. All eight are ``None`` when ``text`` is not
        a string or when any classifier fails.

    Notes
    -----
    Failures stay best-effort (one bad comment must not abort the whole pass),
    but they are reported through a ``RuntimeWarning`` instead of being dropped
    silently, so a systematic problem is visible in the cell output.
    """
    import warnings  # local import keeps the function usable without touching the import cell

    if not isinstance(text, str):
        return pd.Series([None] * 8)

    try:
        # Order matters: it defines the positional layout of the returned Series.
        classifiers = (
            model_sentiment.sentiment,
            model_irony.irony,
            model_hate.hate,
            model_offensive.offensive,
        )
        values = []
        for classify in classifiers:
            values.extend(_top_label_and_prob(classify(text, return_probability=True)))
        return pd.Series(values)
    except Exception as exc:
        warnings.warn(
            f"analyze_comment failed ({type(exc).__name__}: {exc}); "
            "returning empty NLP features for this comment",
            RuntimeWarning,
        )
        return pd.Series([None] * 8)

# Run analysis
print("Running NLP analysis on comments...")
# Output column names, in the positional order analyze_comment() returns them.
nlp_cols = ['sentiment_label', 'sentiment_prob', 'irony_label', 'irony_prob', 
            'hate_label', 'hate_prob', 'offensive_label', 'offensive_prob']

# Use processed_comments dataframe loaded earlier
# We use 'comment_content' for analysis
if 'df_comments' not in locals():
    print("df_comments is not defined. Please run the data loading cells first.")
elif not (len(df_empath) == len(df_lsa) == len(df_comments)):
    # Same row-count guard as the plain merge cell. The concat below aligns on
    # the index, so mismatched frames would be padded with NaN rows rather than
    # rejected. Checking first also avoids a long NLP pass whose result could
    # not be merged correctly.
    print("Skipping NLP analysis and merge due to row count mismatch. Please investigate the data alignment.")
else:
    # One Series of eight values per comment -> DataFrame with eight columns.
    df_nlp = df_comments['comment_content'].progress_apply(analyze_comment)
    df_nlp.columns = nlp_cols

    # Merge NLP results with comments
    df_comments_nlp = pd.concat([df_comments, df_nlp], axis=1)

    # Merge with Empath and LSA
    print("Merging with Empath and LSA features...")
    # Note: This assumes index alignment; the row counts were verified above.
    df_final = pd.concat([df_comments_nlp, df_empath, df_lsa], axis=1)

    print(f"Final merged shape: {df_final.shape}")

    # Save
    output_pkl = os.path.join(data_dir, 'final_merged_data_nlp.pkl')
    df_final.to_pickle(output_pkl)
    print(f"Saved to {output_pkl}")

Loading TweetNLP models...


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassifi

Models loaded.
Running NLP analysis on comments...


  0%|          | 0/139407 [00:00<?, ?it/s]

Merging with Empath and LSA features...
Final merged shape: (139407, 279)
Saved to data-ai-slop-detector/final_merged_data_nlp.pkl
Saved to data-ai-slop-detector/final_merged_data_nlp.pkl


In [6]:
# Show the comments frame with the eight TweetNLP columns appended on the right.
df_comments_nlp

Unnamed: 0,commenter_id,comment_id,parent_id,post_id,comment_content,cleaned_content,num_emojis,num_text_emojis,num_caps_words,num_unicode_chars,...,tagged_grok,used_slang,sentiment_label,sentiment_prob,irony_label,irony_prob,hate_label,hate_prob,offensive_label,offensive_prob
0,AdamParkhomenko,1,,1,https://t.co/rAkU7CWOVE,link,0,0,0,0,...,False,True,neutral,0.655992,non_irony,0.509043,NOT-HATE,0.990611,non-offensive,0.694312
1,SusanSaoirse,2,1.0,1,"@AdamParkhomenko Thing is, a good number of ma...",thing good number maga use medicaid oregon med...,0,0,0,0,...,False,True,negative,0.491858,non_irony,0.528372,NOT-HATE,0.829080,non-offensive,0.807302
2,RealStarTrump,3,1.0,1,@AdamParkhomenko You’re a lying bastard. https...,youre lying bastard link,0,0,0,1,...,False,True,negative,0.938928,non_irony,0.690144,NOT-HATE,0.981233,offensive,0.906779
3,catothewis13876,4,1.0,1,@AdamParkhomenko The false premise is that ill...,false premise illegals get less than american ...,0,0,0,0,...,False,True,negative,0.803585,irony,0.940728,NOT-HATE,0.850179,non-offensive,0.609643
4,masterson11776,5,1.0,1,@AdamParkhomenko Emergency rooms in So Cal are...,emergency rooms indiana so cal full illegal br...,0,0,0,1,...,False,True,negative,0.584216,irony,0.585547,HATE,0.937815,offensive,0.640669
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139402,ColborneGreg,139403,,44,@nvidia @ylecun @soumithchintala @Meta @nyuniv...,i just received push notification nvidia adver...,0,0,2,0,...,False,True,negative,0.677940,non_irony,0.899829,NOT-HATE,0.996151,non-offensive,0.921061
139403,trust_Mina,139404,,44,@nvidia @ylecun @soumithchintala @Meta @nyuniv...,wow seeing indiana nyc makes maine feel so luc...,1,0,3,2,...,False,True,positive,0.987714,non_irony,0.893695,NOT-HATE,0.998502,non-offensive,0.946042
139404,damnThoughtful,139405,,44,@nvidia @ylecun @soumithchintala @Meta @nyuniv...,useless gimped bandwidth buy gpu instead,0,0,1,0,...,False,False,negative,0.894669,non_irony,0.900792,NOT-HATE,0.988562,non-offensive,0.690830
139405,honasu,139406,,44,@nvidia @ylecun @soumithchintala @Meta @nyuniv...,nvidia marketing play horrible part positionin...,0,0,2,0,...,False,True,negative,0.613019,non_irony,0.931100,NOT-HATE,0.997851,non-offensive,0.886043
