In [16]:
import pandas as pd
import os

def sample_data(file_path, sample_fraction=0.1, chunk_size=500000):
    diagnosed_chunks = []
    control_chunks = []
    total_rows = 0
    
    # Determine the total number of rows in the file
    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        total_rows += len(chunk)
    
    half_rows = total_rows // 2
    diagnosed_rows = int(half_rows * sample_fraction)
    control_rows = int(half_rows * sample_fraction)

    diagnosed_sampled = 0
    control_sampled = 0
    
    # Function to determine the class from the first digit of tid and extract userID and postID
    def parse_tid(tid):
        parts = tid.split('_')
        if tid.startswith('1'):
            diagnosed = 'bipolar'
            user_id = '_'.join(parts[1:-1])
            post_id = parts[-1]
        else:
            diagnosed = 'control'
            user_id = parts[2]
            post_id = parts[-1]
        return diagnosed, user_id, post_id
    
    # Read in chunks and sample from each chunk
    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        chunk[['MHC', 'userID', 'postID']] = chunk['tid'].apply(lambda x: pd.Series(parse_tid(x)))
        
        if diagnosed_sampled < diagnosed_rows:
            diagnosed_chunk = chunk[chunk['MHC'] == 'bipolar']
            diagnosed_sample = diagnosed_chunk.sample(frac=sample_fraction)
            diagnosed_chunks.append(diagnosed_sample)
            diagnosed_sampled += len(diagnosed_sample)
        
        if control_sampled < control_rows:
            control_chunk = chunk[chunk['MHC'] == 'control']
            control_sample = control_chunk.sample(frac=sample_fraction)
            control_chunks.append(control_sample)
            control_sampled += len(control_sample)
        
        if diagnosed_sampled >= diagnosed_rows and control_sampled >= control_rows:
            break
    
    # Combine sampled data
    diagnosed_df = pd.concat(diagnosed_chunks)
    control_df = pd.concat(control_chunks)
    sampled_df = pd.concat([diagnosed_df, control_df])
    
    return sampled_df

# Usage
file_path = '~/Downloads/ann.bipolar-balanced-combined.csv'
sampled_data = sample_data(file_path)
print(sampled_data.head())
print(sampled_data.tail())


                                 tid  sid  MLS   MLC      MLT  CpS     CpT  \
491590           1_KaraKaraO_epoljct    0   25  12.5   8.3333    2  0.6667   
177402         1_graamatvede_ehcf03p    3    5   5.0   5.0000    1  1.0000   
184628            1_Katricide_7ypl72    9   12  12.0  12.0000    1  1.0000   
173537      1_theSideWinder7_iifzs57    2    9   9.0   9.0000    1  1.0000   
292976  1_verytiredverymerry_ght0f53    3    2   0.0   2.0000    0  0.0000   

        cTT  dCC  cCT  ...  WF_TOTAL  CD_Podcast   CD_TV  CD_Reddit   WP_TV  \
491590  0.0  0.0  0.0  ...    9.9398      7.1364  7.4528     6.7156  6.4608   
177402  0.0  0.0  0.0  ...    8.9764      7.1418  7.1352     6.5335  6.2915   
184628  1.0  1.0  1.0  ...    9.9690      7.4349  7.2690     7.1165  6.3210   
173537  0.0  0.0  0.0  ...    9.8804      7.1076  6.9337     7.1478  6.0699   
292976  0.0  0.0  0.0  ...    4.7556      3.4928  3.6271     3.4208  3.4792   

        WP_Podcast  WP_Reddit      MHC              user

In [17]:
# minor wrnagling (adding an MHC column (you would want to merge your CYMO results data from diagnosed and controls),
# dropping columns that are not relevant for this step and reorder columns for convenience
merged_complete = sampled_data
# merged_complete['MHC'] = 'bipolar'
# merged_complete[['userID', 'postID']] = merged_complete['tid'].str.rsplit('_', n=1, expand=True)
merged_complete.drop(columns=['tid'], inplace=True)
cols = merged_complete.columns.tolist()
cols = ['userID', 'postID'] + [col for col in cols if col not in ['userID', 'postID']]
merged_complete = merged_complete[cols]
merged_complete.drop(columns=['postID', 'sid'], inplace=True)
merged_complete.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_complete.drop(columns=['postID', 'sid'], inplace=True)


Unnamed: 0,userID,MLS,MLC,MLT,CpS,CpT,cTT,dCC,cCT,CPC,...,WF_Reddit,WF_Podcast,WF_TOTAL,CD_Podcast,CD_TV,CD_Reddit,WP_TV,WP_Podcast,WP_Reddit,MHC
491590,KaraKaraO,25,12.5,8.3333,2,0.6667,0.0,0.0,0.0,1.0,...,9.3224,8.1603,9.9398,7.1364,7.4528,6.7156,6.4608,5.4226,6.4989,bipolar
177402,graamatvede,5,5.0,5.0,1,1.0,0.0,0.0,0.0,0.0,...,8.5419,7.7438,8.9764,7.1418,7.1352,6.5335,6.2915,5.6655,6.2926,bipolar
184628,Katricide,12,12.0,12.0,1,1.0,1.0,1.0,1.0,0.0,...,9.5586,8.4045,9.969,7.4349,7.269,7.1165,6.321,5.7762,6.9274,bipolar
173537,theSideWinder7,9,9.0,9.0,1,1.0,0.0,0.0,0.0,0.0,...,9.5669,8.1308,9.8804,7.1076,6.9337,7.1478,6.0699,5.5181,6.8447,bipolar
292976,verytiredverymerry,2,0.0,2.0,0,0.0,0.0,0.0,0.0,0.0,...,4.5821,3.827,4.7556,3.4928,3.6271,3.4208,3.4792,2.8934,3.2854,bipolar


In [19]:
# this is how your input data for the descriptive stats, the bivariate analyses (t-test + cohen's d) and the shallow machine learning models should look like 
# -> one row per userID, MHC, all averaged CYMO feature scores
merged_complete_agg = merged_complete.groupby(['userID', 'MHC']).mean()
merged_complete_agg.to_string(max_rows=20)
merged_complete_agg.to_csv('ann.merged.csv')