In [3]:
import pandas as pd
import os

# Load in chunks for when you have a large CSV file
chunk_size = 500000  # This can be adjusted based on your memory on your machine
chunks = []
for chunk in pd.read_csv('~/code/urop/reddit/data/2024_output/annotated-combined.csv', chunksize=chunk_size):
    chunks.append(chunk)  

data_sentence = pd.concat(chunks)

# Extract the class label from the first character of the 'tid' and create a new 'fraudulent' column
def extract_label(tid):
    if tid[0].isdigit():
        return int(tid[0])
    else:
        return None

data_sentence['fraudulent'] = data_sentence['tid'].astype(str).apply(extract_label)

# Drop rows where the fraudulent label could not be determined
fraud_sent = data_sentence.dropna(subset=['fraudulent'])

# Display the first few rows of the fraud_sent DataFrame
fraud_sent.head()


Unnamed: 0,tid,sid,MLS,MLC,MLT,CpS,CpT,cTT,dCC,cCT,...,WF_Reddit,WF_Podcast,WF_TOTAL,CD_Podcast,CD_TV,CD_Reddit,WP_TV,WP_Podcast,WP_Reddit,fraudulent
0,1_commie_cows_1dud3qj,0,6,6.0,6.0,1,1.0,0.0,0.0,0.0,...,11.6128,10.6873,12.1646,9.0733,9.0621,8.7048,7.7085,6.8104,8.4486,1.0
1,1_commie_cows_1dud3qj,1,7,3.5,7.0,2,2.0,0.0,0.0,0.0,...,8.1331,7.0915,8.5288,6.3807,6.3269,6.0778,5.6476,4.9589,5.9101,1.0
2,1_commie_cows_1dud3qj,2,33,4.125,16.5,8,4.0,1.0,0.75,3.0,...,9.8301,9.0412,10.3439,7.9534,7.8949,7.4688,6.802,6.116,7.2317,1.0
3,1_commie_cows_1dud3qj,3,10,10.0,10.0,1,1.0,1.0,1.0,1.0,...,10.1892,9.0651,10.5977,8.1009,7.8486,7.8375,6.6651,6.1558,7.5704,1.0
4,1_commie_cows_1dud3qj,4,22,7.3333,11.0,3,1.5,0.5,0.3333,0.5,...,9.7998,8.9615,10.2759,7.7304,7.5125,7.2543,6.4307,5.8481,7.0469,1.0


In [4]:
# # here I had another df called "labels" with two columns: "tid" and "fraudulent", which I join with the original data to add the class labels 
# file_path = '/Users/daniel24/Documents/0_Exaia/Research/FraudDetection/Data/fake_job_labels.csv'  # Replace with your file path
# labels = pd.read_csv(file_path)
# fraud_sent = pd.merge(data_sentence, labels, on='tid', how='inner')

In [5]:
# so this how your CYMO data look like 
fraud_sent.head()

Unnamed: 0,tid,sid,MLS,MLC,MLT,CpS,CpT,cTT,dCC,cCT,...,WF_Reddit,WF_Podcast,WF_TOTAL,CD_Podcast,CD_TV,CD_Reddit,WP_TV,WP_Podcast,WP_Reddit,fraudulent
0,1_commie_cows_1dud3qj,0,6,6.0,6.0,1,1.0,0.0,0.0,0.0,...,11.6128,10.6873,12.1646,9.0733,9.0621,8.7048,7.7085,6.8104,8.4486,1.0
1,1_commie_cows_1dud3qj,1,7,3.5,7.0,2,2.0,0.0,0.0,0.0,...,8.1331,7.0915,8.5288,6.3807,6.3269,6.0778,5.6476,4.9589,5.9101,1.0
2,1_commie_cows_1dud3qj,2,33,4.125,16.5,8,4.0,1.0,0.75,3.0,...,9.8301,9.0412,10.3439,7.9534,7.8949,7.4688,6.802,6.116,7.2317,1.0
3,1_commie_cows_1dud3qj,3,10,10.0,10.0,1,1.0,1.0,1.0,1.0,...,10.1892,9.0651,10.5977,8.1009,7.8486,7.8375,6.6651,6.1558,7.5704,1.0
4,1_commie_cows_1dud3qj,4,22,7.3333,11.0,3,1.5,0.5,0.3333,0.5,...,9.7998,8.9615,10.2759,7.7304,7.5125,7.2543,6.4307,5.8481,7.0469,1.0


In [6]:
# this is how your input data for the descriptive stats, the bivariate analyses (t-test + cohen's d) and the shallow machine learning models should look like 
# -> one row per userID, MHC, all averaged CYMO feature scores
data_tid = fraud_sent.groupby(['tid']).mean()
data_tid.drop(columns=['sid'], inplace=True)
# Move the 'sid' column to the front
columns = ['fraudulent'] + [col for col in data_tid if col != 'fraudulent']
data_tid = data_tid[columns]
data_tid.head()

Unnamed: 0_level_0,fraudulent,MLS,MLC,MLT,CpS,CpT,cTT,dCC,cCT,CPC,...,WF_TV,WF_Reddit,WF_Podcast,WF_TOTAL,CD_Podcast,CD_TV,CD_Reddit,WP_TV,WP_Podcast,WP_Reddit
tid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000Ak47_6eje92,0.0,15.0,7.8889,7.5,2.333333,1.166667,0.166667,0.2222,0.333333,0.555533,...,8.5004,9.684167,8.7325,10.203467,7.716633,7.647167,7.2409,6.622433,5.886833,7.029233
0000Ak47_8vpgaj,0.0,12.5,2.75,7.0,2.0,1.0,0.25,0.375,0.75,0.125,...,7.3023,8.68365,7.4178,9.0273,6.70315,6.69035,6.6634,5.93485,5.26065,6.518
0000Ak47_9arwqg,0.0,10.5,6.5,10.5,2.0,2.0,0.5,0.16665,0.5,0.0,...,8.49485,9.57885,8.5101,10.10875,7.67325,7.7223,7.2468,6.73245,5.93125,7.0206
0000Ak47_9b6brx,0.0,19.3,5.17107,10.229165,3.4,1.704165,0.399995,0.420005,0.970835,0.16524,...,9.133375,10.0778,9.14105,10.64159,8.05973,8.14046,7.61434,6.98332,6.17586,7.36959
0000Ak47_9ceuws,0.0,10.666667,6.5,10.666667,1.666667,1.666667,0.333333,0.166667,0.333333,0.0,...,7.725367,8.8012,7.859667,9.312867,6.968933,6.8929,6.6154,6.003667,5.408067,6.419133


In [19]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
from math import sqrt

# List of numerical columns to test
numerical_columns = fraud_sent.columns.drop(['tid', 'sid', 'fraudulent'])

# Separate data into fraudulent and non-fraudulent
fraudulent_data = fraud_sent[fraud_sent['fraudulent'] == 1]
non_fraudulent_data = fraud_sent[fraud_sent['fraudulent'] == 0]

# Perform t-tests
t_stat, p_values = [], []
for column in numerical_columns:
    t_stat_current, p_value_current = ttest_ind(fraudulent_data[column], non_fraudulent_data[column], nan_policy='omit')
    t_stat.append(t_stat_current)
    p_values.append(p_value_current)

# Apply Bonferroni correction
_, p_values_corrected, _, _ = multipletests(p_values, alpha=0.05, method='bonferroni')

# Calculate Cohen's d
cohens_d = []
for column in numerical_columns:
    mean_fraud = fraudulent_data[column].mean()
    mean_non_fraud = non_fraudulent_data[column].mean()
    std_fraud = fraudulent_data[column].std()
    std_non_fraud = non_fraudulent_data[column].std()
    pooled_std = sqrt((std_fraud**2 + std_non_fraud**2) / 2)
    cohens_d.append((mean_fraud - mean_non_fraud) / pooled_std)

# Create a DataFrame to display results
results_df = pd.DataFrame({
    'Feature': numerical_columns,
    'T-Statistic': t_stat,
    'P-Value (Corrected)': p_values_corrected,
    'Cohen\'s d': cohens_d
})

# Order the results by increasing p-value
results_df = results_df.sort_values(by="Cohen's d", ascending=False)

# Display the ordered results
# print(results_df)


In [20]:
# Display the results
filtered_results_df

Unnamed: 0,Feature,T-Statistic,P-Value (Corrected),Cohen's d
11,CompNomC,4.930791,3.309774e-04,0.202427
12,CompNomT,4.915400,3.580710e-04,0.207143
13,CompNomS,4.361935,5.209352e-03,0.183138
14,NPPreMod,4.909870,3.683158e-04,0.176637
17,KDbase,-7.288660,1.266713e-10,-0.254381
...,...,...,...,...
383,acad_freq_90k,4.040227,2.157684e-02,0.133956
384,acad_freq_100k,4.384188,4.704349e-03,0.143801
388,acad_juilland_D,4.959023,2.863286e-04,0.163410
389,acad_carroll_D2,4.140959,1.397508e-02,0.138751
