In [3]:
import pandas as pd
import os

# O load in chunks here for when you have a large CSV file here
chunk_size = 500000  # This can be adjusted based on your memory on your machine
chunks = []
for chunk in pd.read_csv('~/code/urop/reddit/data/2019_output/ann-combined.csv', chunksize=chunk_size):
    chunks.append(chunk)  

merged_complete = pd.concat(chunks)

# this is how the data come out of CYMO
merged_complete.head()

Unnamed: 0,tid,sid,MLS,MLC,MLT,CpS,CpT,cTT,dCC,cCT,...,WF_TV,WF_Reddit,WF_Podcast,WF_TOTAL,CD_Podcast,CD_TV,CD_Reddit,WP_TV,WP_Podcast,WP_Reddit
0,1_TwinkleMcFabulous_xg014s,0,26,6.5,13.0,4,2.0,0.0,0.25,0.5,...,8.5683,9.8142,8.6662,10.3114,7.6895,7.7033,7.3586,6.7356,5.9364,7.1347
1,1_TwinkleMcFabulous_xg014s,1,1,0.0,1.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1_TwinkleMcFabulous_wiub0i,0,26,5.2,26.0,5,5.0,1.0,0.4,2.0,...,9.1037,10.2036,8.8894,10.6714,8.0315,8.2333,7.6953,7.1395,6.2445,7.4719
3,1_TwinkleMcFabulous_wiub0i,1,31,7.75,15.5,4,2.0,0.5,0.25,0.5,...,9.1174,10.3798,9.2728,10.8583,8.3372,8.2661,7.9107,7.1645,6.3989,7.6734
4,1_TwinkleMcFabulous_wb878p,0,22,5.5,11.0,4,2.0,0.5,0.5,1.0,...,8.8924,9.8744,8.8068,10.3546,7.7493,7.9091,7.4039,6.834,5.9707,7.1932


In [34]:
# Add an MHC column indicating the condition
merged_complete['MHC'] = 'bipolar'

# Split the 'tid' into 'diagnosed', 'userID', and 'postID'
merged_complete[['diagnosed', 'userID', 'postID']] = merged_complete['tid'].str.extract(r'(\d)_(.+)_(.+)')

# Drop the 'tid' column
merged_complete.drop(columns=['tid'], inplace=True)

# Reorder columns for convenience
cols = merged_complete.columns.tolist()
cols = ['diagnosed', 'userID', 'postID'] + [col for col in cols if col not in ['diagnosed', 'userID', 'postID']]
merged_complete = merged_complete[cols]

# Drop 'postID' and 'sid' columns as they are not relevant for this step
merged_complete.drop(columns=['postID', 'sid'], inplace=True)

# Display the first few rows of the modified DataFrame
merged_complete.head()


Unnamed: 0,userID,MLS,MLC,MLT,CpS,CpT,cTT,dCC,cCT,CPC,...,WF_Reddit,WF_Podcast,WF_TOTAL,CD_Podcast,CD_TV,CD_Reddit,WP_TV,WP_Podcast,WP_Reddit,MHC
0,Winter_Result_8734,14,7.0,7.0,2,1.0,0.0,0.0,0.0,0.5,...,9.3583,7.7811,9.8147,6.5324,6.6147,6.5503,5.7089,4.9822,6.361,bipolar
1,Winter_Result_8734,12,6.0,6.0,2,1.0,0.0,0.0,0.0,0.5,...,8.3093,7.2072,8.6135,6.2854,6.2007,6.2256,5.3935,4.8545,6.0758,bipolar
2,Winter_Result_8734,16,16.0,8.0,1,0.5,0.0,0.0,0.0,1.0,...,7.3584,6.8945,7.8932,6.0995,6.0041,5.4897,5.2649,4.7415,5.2891,bipolar
3,Winter_Result_8734,22,7.3333,7.3333,3,1.0,0.0,0.0,0.0,0.6667,...,8.9395,8.0786,9.5084,7.2884,7.2631,6.754,6.4204,5.6665,6.5383,bipolar
4,Winter_Result_8734,12,12.0,12.0,1,1.0,0.0,0.0,0.0,0.0,...,9.8824,8.7722,10.3,7.6236,7.6063,7.3673,6.6743,5.9335,7.1131,bipolar


In [35]:
# this is how your input data for the descriptive stats, the bivariate analyses (t-test + cohen's d) and the shallow machine learning models should look like 
# -> one row per userID, MHC, all averaged CYMO feature scores
merged_complete_agg = merged_complete.groupby(['userID', 'MHC']).mean()
merged_complete_agg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,MLS,MLC,MLT,CpS,CpT,cTT,dCC,cCT,CPC,CPT,...,WF_TV,WF_Reddit,WF_Podcast,WF_TOTAL,CD_Podcast,CD_TV,CD_Reddit,WP_TV,WP_Podcast,WP_Reddit
userID,MHC,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Winter_Result_8734,bipolar,12.211921,5.373305,8.580463,2.066225,1.45585,0.352097,0.278492,0.634326,0.168968,0.189075,...,8.312054,9.288419,8.253016,9.852244,7.272956,7.352277,6.903538,6.387123,5.585654,6.692154
