In [15]:
import pandas as pd
import os

# Load in chunks for when you have a large CSV file
chunk_size = 500000  # This can be adjusted based on your memory on your machine
chunks = []
for chunk in pd.read_csv('~/Downloads/ann.diagnosed-data.cymo.csv', chunksize=chunk_size):
    chunks.append(chunk)  

data_sentence = pd.concat(chunks)

# Extract the class label from the first character of the 'tid' and create a new 'fraudulent' column
def extract_label(tid):
    if tid[0].isdigit():
        return int(tid[0])
    else:
        return None

data_sentence['fraudulent'] = data_sentence['tid'].astype(str).apply(extract_label)

# Drop rows where the fraudulent label could not be determined
fraud_sent = data_sentence.dropna(subset=['fraudulent'])

# Display the first few rows of the fraud_sent DataFrame
fraud_sent.head()


Unnamed: 0,tid,sid,MLS,MLC,MLT,CpS,CpT,cTT,dCC,cCT,...,WF_Reddit,WF_Podcast,WF_TOTAL,CD_Podcast,CD_TV,CD_Reddit,WP_TV,WP_Podcast,WP_Reddit,fraudulent
0,1_commie_cows_1dud3qj,0,6,6.0,6.0,1,1.0,0.0,0.0,0.0,...,11.6128,10.6873,12.1646,9.0733,9.0621,8.7048,7.7085,6.8104,8.4486,1
1,1_commie_cows_1dud3qj,1,7,3.5,7.0,2,2.0,0.0,0.0,0.0,...,8.1331,7.0915,8.5288,6.3807,6.3269,6.0778,5.6476,4.9589,5.9101,1
2,1_commie_cows_1dud3qj,2,33,4.125,16.5,8,4.0,1.0,0.75,3.0,...,9.8301,9.0412,10.3439,7.9534,7.8949,7.4688,6.802,6.116,7.2317,1
3,1_commie_cows_1dud3qj,3,10,10.0,10.0,1,1.0,1.0,1.0,1.0,...,10.1892,9.0651,10.5977,8.1009,7.8486,7.8375,6.6651,6.1558,7.5704,1
4,1_commie_cows_1dud3qj,4,22,7.3333,11.0,3,1.5,0.5,0.3333,0.5,...,9.7998,8.9615,10.2759,7.7304,7.5125,7.2543,6.4307,5.8481,7.0469,1


In [16]:
# # here I had another df called "labels" with two columns: "tid" and "fraudulent", which I join with the original data to add the class labels 
# file_path = '/Users/daniel24/Documents/0_Exaia/Research/FraudDetection/Data/fake_job_labels.csv'  # Replace with your file path
# labels = pd.read_csv(file_path)
# fraud_sent = pd.merge(data_sentence, labels, on='tid', how='inner')

In [17]:
# this is how your input data for the descriptive stats, the bivariate analyses (t-test + cohen's d) and the shallow machine learning models should look like 
# -> one row per userID, MHC, all averaged CYMO feature scores
data_tid = fraud_sent.groupby(['tid']).mean()
data_tid.drop(columns=['sid'], inplace=True)
# Move the 'sid' column to the front
columns = ['fraudulent'] + [col for col in data_tid if col != 'fraudulent']
data_tid = data_tid[columns]
data_tid.head()

Unnamed: 0_level_0,fraudulent,MLS,MLC,MLT,CpS,CpT,cTT,dCC,cCT,CPC,...,WF_TV,WF_Reddit,WF_Podcast,WF_TOTAL,CD_Podcast,CD_TV,CD_Reddit,WP_TV,WP_Podcast,WP_Reddit
tid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1_-dogtopus-_11loutn,1.0,15.730769,6.180312,10.525642,2.923077,2.096154,0.641027,0.433792,1.089746,0.204215,...,8.983015,10.016731,8.736065,10.514185,7.735242,7.848954,7.416542,6.783862,5.964604,7.179646
1_-dogtopus-_13c1yl4,1.0,17.875,7.281238,14.9375,2.75,2.25,0.625,0.312487,0.875,0.177075,...,8.078625,9.347225,8.250875,9.787175,7.329713,7.246125,7.017075,6.28785,5.624937,6.813487
1_-dogtopus-_13k6o6h,1.0,12.5,7.0,8.375,1.75,1.0625,0.166662,0.145838,0.291662,0.229162,...,8.030487,9.40215,8.227063,9.867775,7.345087,7.238725,7.0132,6.355288,5.684275,6.816225
1_-dogtopus-_14nirev,1.0,14.352941,7.124512,7.784312,2.352941,1.176471,0.294118,0.285294,0.426471,0.340194,...,8.322,9.434382,8.190541,9.901576,7.254135,7.401112,6.920065,6.460871,5.623006,6.685076
1_-dogtopus-_14scbwt,1.0,24.0,5.509256,16.555556,4.666667,3.277778,0.677778,0.434567,1.822222,0.127778,...,8.938633,10.007622,9.072889,10.542556,8.019678,7.954689,7.546867,6.839722,6.124389,7.329733


In [20]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
from math import sqrt

# List of numerical columns to test
numerical_columns = fraud_sent.columns.drop(['tid', 'sid', 'fraudulent'])

# Separate data into fraudulent and non-fraudulent
fraudulent_data = fraud_sent[fraud_sent['fraudulent'] == 1]
non_fraudulent_data = fraud_sent[fraud_sent['fraudulent'] == 0]

# Perform t-tests
t_stat, p_values = [], []
for column in numerical_columns:
    t_stat_current, p_value_current = ttest_ind(fraudulent_data[column], non_fraudulent_data[column], nan_policy='omit')
    t_stat.append(t_stat_current)
    p_values.append(p_value_current)

# Apply Bonferroni correction
_, p_values_corrected, _, _ = multipletests(p_values, alpha=0.05, method='bonferroni')

# Calculate Cohen's d
cohens_d = []
for column in numerical_columns:
    mean_fraud = fraudulent_data[column].mean()
    mean_non_fraud = non_fraudulent_data[column].mean()
    std_fraud = fraudulent_data[column].std()
    std_non_fraud = non_fraudulent_data[column].std()
    pooled_std = sqrt((std_fraud**2 + std_non_fraud**2) / 2)
    cohens_d.append((mean_fraud - mean_non_fraud) / pooled_std)

# Create a DataFrame to display results
results_df = pd.DataFrame({
    'Feature': numerical_columns,
    'T-Statistic': t_stat,
    'P-Value (Corrected)': p_values_corrected,
    'Cohen\'s d': cohens_d
})

# Order the results by increasing p-value
results_df = results_df.sort_values(by="Cohen's d", ascending=False)

print(fraudulent_data.head())
print(non_fraudulent_data.head())
# Display the ordered results
# print(results_df)


                     tid  sid  MLS      MLC   MLT  CpS  CpT  cTT     dCC  cCT  \
0  1_commie_cows_1dud3qj    0    6   6.0000   6.0    1  1.0  0.0  0.0000  0.0   
1  1_commie_cows_1dud3qj    1    7   3.5000   7.0    2  2.0  0.0  0.0000  0.0   
2  1_commie_cows_1dud3qj    2   33   4.1250  16.5    8  4.0  1.0  0.7500  3.0   
3  1_commie_cows_1dud3qj    3   10  10.0000  10.0    1  1.0  1.0  1.0000  1.0   
4  1_commie_cows_1dud3qj    4   22   7.3333  11.0    3  1.5  0.5  0.3333  0.5   

   ...  WF_Reddit  WF_Podcast  WF_TOTAL  CD_Podcast   CD_TV  CD_Reddit  \
0  ...    11.6128     10.6873   12.1646      9.0733  9.0621     8.7048   
1  ...     8.1331      7.0915    8.5288      6.3807  6.3269     6.0778   
2  ...     9.8301      9.0412   10.3439      7.9534  7.8949     7.4688   
3  ...    10.1892      9.0651   10.5977      8.1009  7.8486     7.8375   
4  ...     9.7998      8.9615   10.2759      7.7304  7.5125     7.2543   

    WP_TV  WP_Podcast  WP_Reddit  fraudulent  
0  7.7085      6.8104

In [19]:
# Display the results
results_df

Unnamed: 0,Feature,T-Statistic,P-Value (Corrected),Cohen's d
0,MLS,,,
1,MLC,,,
2,MLT,,,
3,CpS,,,
4,CpT,,,
...,...,...,...,...
399,CD_TV,,,
400,CD_Reddit,,,
401,WP_TV,,,
402,WP_Podcast,,,
