In [13]:
import pandas as pd
import os

# Load in chunks for when you have a large CSV file
chunk_size = 500000  # This can be adjusted based on your memory on your machine
chunks = []
for chunk in pd.read_csv('~/code/urop/reddit/data/2019_output/ann-combined.csv', chunksize=chunk_size):
    chunks.append(chunk)  

data_sentence = pd.concat(chunks)

# Extract the class label from the first character of the 'tid' and create a new 'fraudulent' column
def extract_label(tid):
    if tid[0].isdigit():
        return int(tid[0])
    else:
        return None

data_sentence['fraudulent'] = data_sentence['tid'].astype(str).apply(extract_label)

# Drop rows where the fraudulent label could not be determined
fraud_sent = data_sentence.dropna(subset=['fraudulent'])

# Display the first few rows of the fraud_sent DataFrame
fraud_sent.head()


Unnamed: 0,tid,sid,MLS,MLC,MLT,CpS,CpT,cTT,dCC,cCT,...,WF_Reddit,WF_Podcast,WF_TOTAL,CD_Podcast,CD_TV,CD_Reddit,WP_TV,WP_Podcast,WP_Reddit,fraudulent
0,1_TwinkleMcFabulous_xg014s,0,26,6.5,13.0,4,2.0,0.0,0.25,0.5,...,9.8142,8.6662,10.3114,7.6895,7.7033,7.3586,6.7356,5.9364,7.1347,1
1,1_TwinkleMcFabulous_xg014s,1,1,0.0,1.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,1_TwinkleMcFabulous_wiub0i,0,26,5.2,26.0,5,5.0,1.0,0.4,2.0,...,10.2036,8.8894,10.6714,8.0315,8.2333,7.6953,7.1395,6.2445,7.4719,1
3,1_TwinkleMcFabulous_wiub0i,1,31,7.75,15.5,4,2.0,0.5,0.25,0.5,...,10.3798,9.2728,10.8583,8.3372,8.2661,7.9107,7.1645,6.3989,7.6734,1
4,1_TwinkleMcFabulous_wb878p,0,22,5.5,11.0,4,2.0,0.5,0.5,1.0,...,9.8744,8.8068,10.3546,7.7493,7.9091,7.4039,6.834,5.9707,7.1932,1


In [14]:
# # here I had another df called "labels" with two columns: "tid" and "fraudulent", which I join with the original data to add the class labels 
# file_path = '/Users/daniel24/Documents/0_Exaia/Research/FraudDetection/Data/fake_job_labels.csv'  # Replace with your file path
# labels = pd.read_csv(file_path)
# fraud_sent = pd.merge(data_sentence, labels, on='tid', how='inner')

In [15]:
# so this how your CYMO data look like 
fraud_sent.head()

Unnamed: 0,tid,sid,MLS,MLC,MLT,CpS,CpT,cTT,dCC,cCT,...,WF_Reddit,WF_Podcast,WF_TOTAL,CD_Podcast,CD_TV,CD_Reddit,WP_TV,WP_Podcast,WP_Reddit,fraudulent
0,1_TwinkleMcFabulous_xg014s,0,26,6.5,13.0,4,2.0,0.0,0.25,0.5,...,9.8142,8.6662,10.3114,7.6895,7.7033,7.3586,6.7356,5.9364,7.1347,1
1,1_TwinkleMcFabulous_xg014s,1,1,0.0,1.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,1_TwinkleMcFabulous_wiub0i,0,26,5.2,26.0,5,5.0,1.0,0.4,2.0,...,10.2036,8.8894,10.6714,8.0315,8.2333,7.6953,7.1395,6.2445,7.4719,1
3,1_TwinkleMcFabulous_wiub0i,1,31,7.75,15.5,4,2.0,0.5,0.25,0.5,...,10.3798,9.2728,10.8583,8.3372,8.2661,7.9107,7.1645,6.3989,7.6734,1
4,1_TwinkleMcFabulous_wb878p,0,22,5.5,11.0,4,2.0,0.5,0.5,1.0,...,9.8744,8.8068,10.3546,7.7493,7.9091,7.4039,6.834,5.9707,7.1932,1


In [16]:
# this is how your input data for the descriptive stats, the bivariate analyses (t-test + cohen's d) and the shallow machine learning models should look like 
# -> one row per userID, MHC, all averaged CYMO feature scores
data_tid = fraud_sent.groupby(['tid']).mean()
data_tid.drop(columns=['sid'], inplace=True)
# Move the 'sid' column to the front
columns = ['fraudulent'] + [col for col in data_tid if col != 'fraudulent']
data_tid = data_tid[columns]
data_tid.head()

Unnamed: 0_level_0,fraudulent,MLS,MLC,MLT,CpS,CpT,cTT,dCC,cCT,CPC,...,WF_TV,WF_Reddit,WF_Podcast,WF_TOTAL,CD_Podcast,CD_TV,CD_Reddit,WP_TV,WP_Podcast,WP_Reddit
tid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0_0xPolygon_jnpln_ChillCaptain_125u6y6,0.0,11.714286,5.357143,6.857143,1.857143,1.0,0.214286,0.285714,0.5,0.25,...,7.564857,9.0904,7.953114,9.542586,7.056743,6.790729,6.791971,5.883471,5.378414,6.560543
0_100pushups_Backupaccount524_JJWentMMA_4rg22r,0.0,10.333333,5.0,7.833333,2.333333,1.833333,0.5,0.333333,0.833333,0.1111,...,8.535967,9.6268,8.7613,10.201633,7.6758,7.501933,7.120467,6.490067,5.865433,6.8953
0_100thieves_MeringueWhich9353_Keito1000_jxdaxe,0.0,13.5,4.5,6.75,3.0,1.5,0.5,0.3333,0.5,0.3333,...,8.95385,10.30935,9.27455,10.81025,8.0049,7.8212,7.61835,6.69435,6.0772,7.3611
0_10s_gamerccxxi_General-Writing1764_1diizfy,0.0,8.333333,4.833333,8.333333,1.666667,1.666667,0.666667,0.333333,0.666667,0.0,...,8.175967,8.990867,8.231967,9.4775,7.0906,7.1247,6.7177,6.077233,5.362367,6.5177
0_13sentinels_ichigo_wildblossom_Fun-Wash-8858_yq75o0,0.0,16.0,4.708325,10.5,2.25,1.5,0.375,0.166675,0.375,0.041675,...,6.154325,7.007325,6.202,7.3643,5.482075,5.4677,5.320375,4.74335,4.24955,5.16705


In [17]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
from math import sqrt

# List of numerical columns to test
numerical_columns = fraud_sent.columns.drop(['tid', 'sid', 'fraudulent'])

# Separate data into fraudulent and non-fraudulent
fraudulent_data = fraud_sent[fraud_sent['fraudulent'] == 1]
non_fraudulent_data = fraud_sent[fraud_sent['fraudulent'] == 0]

# Perform t-tests
t_stat, p_values = [], []
for column in numerical_columns:
    t_stat_current, p_value_current = ttest_ind(fraudulent_data[column], non_fraudulent_data[column], nan_policy='omit')
    t_stat.append(t_stat_current)
    p_values.append(p_value_current)

# Apply Bonferroni correction
_, p_values_corrected, _, _ = multipletests(p_values, alpha=0.05, method='bonferroni')

# Calculate Cohen's d
cohens_d = []
for column in numerical_columns:
    mean_fraud = fraudulent_data[column].mean()
    mean_non_fraud = non_fraudulent_data[column].mean()
    std_fraud = fraudulent_data[column].std()
    std_non_fraud = non_fraudulent_data[column].std()
    pooled_std = sqrt((std_fraud**2 + std_non_fraud**2) / 2)
    cohens_d.append((mean_fraud - mean_non_fraud) / pooled_std)

# Create a DataFrame to display results
results_df = pd.DataFrame({
    'Feature': numerical_columns,
    'T-Statistic': t_stat,
    'P-Value (Corrected)': p_values_corrected,
    'Cohen\'s d': cohens_d
})

# Order the results by increasing p-value
results_df = results_df.sort_values(by="Cohen's d", ascending=False)

# Display the ordered results
# print(results_df)


In [18]:
# Display the results
results_df

Unnamed: 0,Feature,T-Statistic,P-Value (Corrected),Cohen's d
85,adjacent-overlap.binary-2-all-sent,198.290189,0.0,0.661025
97,adjacent-overlap.binary-2-fw-sent,191.639501,0.0,0.646864
133,adjacent-overlap.binary-2-argument-sent,182.716766,0.0,0.632225
127,adjacent-overlap.binary-2-pronoun-sent,170.277912,0.0,0.607849
95,adjacent-overlap.2-fw-sent,147.019505,0.0,0.580267
...,...,...,...,...
123,adjacent-overlap.pronoun-sent-div-seg,-61.177578,0.0,-0.209434
87,adjacent-overlap.cw-sent-div-seg,-71.037269,0.0,-0.237600
129,adjacent-overlap.argument-sent-div-seg,-77.708958,0.0,-0.262616
93,adjacent-overlap.fw-sent-div-seg,-83.177945,0.0,-0.286229
