In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from tableone import TableOne
import glob
import re
import os

In [None]:
def rm_CUD_baseline(subs_all):
    """Remove every session belonging to an excluded subject.

    Parameters
    ----------
    subs_all : iterable of (sub, ses) pairs
        Subject ID (e.g. ``'MM_014'``) and session label for each scan.

    Returns
    -------
    list of [sub, ses] lists
        The input pairs, in their original order, minus all pairs whose
        subject ID is on the exclusion list.
    """
    # MM subjects who had cannabis use disorder at baseline (an exclusion criterion)
    cud_at_baseline = ('MM_014', 'MM_188', 'MM_197', 'MM_217', 'MM_228', 'MM_239', 'MM_241')

    kept = []
    for sub, ses in subs_all:
        if sub in cud_at_baseline:
            continue
        kept.append([sub, ses])

    return kept
    

In [None]:
def get_all_subs(root='../../..'):
    """List every (subject, session) pair that has a session directory on disk.

    Looks for ``<root>/sub-*/ses-baseline`` and ``<root>/sub-*/ses-1year``.
    A directory such as ``sub-MM014/ses-1year`` becomes ``['MM_014', '1year']``:
    the text after ``sub-`` is split around its ``MM``/``HC`` group prefix and
    re-joined with an underscore.

    Parameters
    ----------
    root : str, optional
        Directory containing the ``sub-*`` folders. Defaults to the location
        used by this notebook (three levels up from the working directory).

    Returns
    -------
    list of [sub, ses] lists
        All baseline entries first, followed by all one-year entries. The order
        within each session is whatever ``glob`` returns (not sorted).
    """

    def _sessions(ses_dir_name):
        # Collect [subject, session] for every sub-* directory containing ses_dir_name.
        found = []
        for path in glob.glob(os.path.join(root, 'sub-*', ses_dir_name)):
            # Take the last two path components by name rather than by a fixed
            # index after splitting on '/', so the parsing does not depend on
            # the depth of `root` or on the platform's path separator.
            path = os.path.normpath(path)
            sub_label = os.path.basename(os.path.dirname(path)).split('-')[1]
            ses_label = os.path.basename(path).split('-')[1]
            # re.split with a capture group keeps the prefix: 'MM014' -> ['', 'MM', '014']
            sub_id = '_'.join(re.split(r'(MM|HC)', sub_label)[1:])
            found.append([sub_id, ses_label])
        return found

    return _sessions('ses-baseline') + _sessions('ses-1year')
    

In [None]:
def get_paired_MC_subs(subs_all):
    """Return the MM subjects that appear with exactly two sessions.

    Parameters
    ----------
    subs_all : iterable of (sub, ses) pairs
        Subject ID and session label for each scan.

    Returns
    -------
    list of [sub, 'baseline'] lists
        One entry per subject whose ID contains ``'MM'`` and that occurs exactly
        twice in ``subs_all``, in order of first appearance. The session label
        is always ``'baseline'``.
    """
    # number of sessions seen per MM subject (dicts keep first-seen order)
    n_sessions = defaultdict(int)
    for sub, _ses in subs_all:
        if 'MM' not in sub:
            continue
        n_sessions[sub] += 1

    return [[sub, 'baseline'] for sub, count in n_sessions.items() if count == 2]

In [None]:
def create_indiv_subs_df(subs_input,
                         non_img_data_path="../../../sourcedata/non_imaging_data/MMJ-Processed_data-2022_05_27-13_58-6858bbe.csv"):
    """Build a one-row-per-(subject, session) dataframe with non-imaging variables.

    Parameters
    ----------
    subs_input : list of [sub, ses] pairs
        Subject ID (``'<group>_<number>'``) and session label. The list is not
        modified; rows of the result are in sorted order.
    non_img_data_path : str, optional
        CSV with the non-imaging data. It must contain ``IDS.CHR.Subject``,
        ``SSS.CHR.Time_point`` and every source column named in
        ``simple_additions`` below.

    Returns
    -------
    pandas.DataFrame
        Columns ``subs``, ``sub_ses`` (``'<sub>_<ses>'``), ``group_ses``
        (``'<group>_<ses>'``) plus one column per entry of ``simple_additions``.
        Values are looked up by subject ID, so both sessions of a subject get
        the same value in every column.
    """
    # sorted() rather than list.sort(): leave the caller's list untouched
    ordered_subs = sorted(subs_input)
    records = [[item[0], item[0] + '_' + item[1], item[0].split('_')[0] + '_' + item[1]]
               for item in ordered_subs]
    df_subs = pd.DataFrame.from_records(records, columns=['subs', 'sub_ses', 'group_ses'])

    # load the non-imaging data
    non_img_data = pd.read_csv(non_img_data_path, low_memory=False)

    # (source column in the CSV, column name in the output)
    simple_additions = [('SBJ.CHR.Sex','Sex'),("SBJ.INT.Age",'Age'),("SBJ.CHR.Race",'Race'),
                        ("SBJ.CHR.Ethnicity",'Ethnicity'),("SBJ.CHR.Education_level",'Education level'),
                        ("SBJ.INT.Education_years",'Education years'),("SBJ.CHR.Employment_status",'Employment status'),
                        ("SBJ.CHR.Handedness",'Handedness'),('SSS.CHR.Primary_condition','Condition'),
                        ('URN.LGC.THC_present','Positive urine THC at baseline'),('URN.LGC.THC_present','Positive urine THC at one-year'),
                        ('CUD.CHR.Diagnosis','CUD diagnosis at baseline'),('CUD.CHR.Diagnosis','CUD diagnosis at one-year'),
                        ('INV.INT.CUDIT.Summed_score','CUDIT summed score at baseline'),('INV.INT.CUDIT.Summed_score','CUDIT summed score at one-year'),
                        ('TLF.CHR.THC.Frequency_in_month','THC frequency per month at baseline'),('TLF.CHR.THC.Frequency_in_month','THC frequency per month at one-year')]

    # (source column, output column) pairs to be looked up per subject *and* session;
    # currently empty, so the second loop below does nothing
    by_ses_additions = []

    # value recodes applied to specific source columns; unlisted values pass through
    recodes = {
        'SBJ.CHR.Race': {
            'Caucasian': 'White',
            'African American': 'Black',
            'Asian': 'Other',
            'Multi-racial': 'Other',
            'Pacific Islander': 'Other',
        },
        'SSS.CHR.Primary_condition': {
            'Affective Disorder (Depression/Anxiety)': 'Depression/anxiety symptoms',
            'Insomnia': 'Insomnia symptoms',
            'Pain': 'Pain symptoms',
        },
    }

    # the same filtered frame serves every "at one-year" column
    one_year_rows = non_img_data[non_img_data['SSS.CHR.Time_point'] == 'One year']

    for orig_name, col_name in simple_additions:
        # NOTE(review): only the "at one-year" columns are restricted to a time
        # point. Every other column, including the "at baseline" ones, takes the
        # first non-missing value over all of a subject's rows, so it depends on
        # row order in the CSV - confirm this is intended.
        source = one_year_rows if 'at one-year' in col_name else non_img_data
        dict_map = source.groupby('IDS.CHR.Subject')[orig_name].agg("first").to_dict()

        recode = recodes.get(orig_name)
        if recode is not None:
            dict_map = {sub: recode.get(val, val) for sub, val in dict_map.items()}

        df_subs[col_name] = df_subs['subs'].map(dict_map)

    # (time point in the CSV, group prefix, session label)
    ses_sources = (('Screening', 'HC', 'baseline'),
                   ('Baseline', 'MM', 'baseline'),
                   ('One year', 'MM', '1year'))

    for orig_name, col_name in by_ses_additions:
        dict_all = {}
        for time_point, group, ses in ses_sources:
            first_vals = (non_img_data[non_img_data['SSS.CHR.Time_point'] == time_point]
                          .groupby('IDS.CHR.Subject')[orig_name].agg("first").to_dict())
            dict_all.update({f'{sub}_{ses}': val for sub, val in first_vals.items() if group in sub})
        df_subs[col_name] = df_subs['sub_ses'].map(dict_all)

    # Assign the result back: an in-place replace on a column selection warns in
    # pandas 2.2 and is a silent no-op under Copy-on-Write.
    df_subs['Employment status'] = df_subs['Employment status'].replace({'self': 'Self'}, regex=True)

    return df_subs


In [None]:
def save_table1(df_subs,include_pval,paired):
    """Build the demographics "Table 1" with tableone, tidy it, save it and display it.

    The TableOne result is written to CSV and read back to obtain a flat
    dataframe; the tidied table then overwrites that same file.

    Parameters
    ----------
    df_subs : pandas.DataFrame
        One row per (subject, session), with a ``group_ses`` column and the
        demographic columns listed in ``columns`` below.
    include_pval : bool
        Ask TableOne for p-values and test names and keep them in the output
        as ``P-values`` and ``H-test``.
    paired : bool
        Whether ``df_subs`` contains the extra ``MM_paired`` group; if so it is
        kept in the output as ``MCC paired``.

    Returns
    -------
    None
        Writes ``../../../derivatives/demographics/table1_all_<pval|no_pval>[_with_paired].csv``.
    """
    # this only includes the demographics
    columns = ['Sex', 'Age', 'Race', 'Ethnicity', 'Education years', 'Condition', 'Handedness']
    categorical = ['Sex', 'Race', 'Ethnicity', 'Condition', 'Handedness']

    groupby = ['group_ses']
    # NOTE(review): tableone documents `rename` as a mapping of variable names,
    # whereas these keys are group_ses levels - confirm it has any effect.
    labels={'HC_baseline': 'HC baseline','MM_baseline': 'MC baseline','MM_1year': 'MC one-year'}

    test_kwargs = {'pval': True, 'htest_name': True} if include_pval else {'pval': False}
    mytable = TableOne(df_subs, columns=columns, categorical=categorical, groupby=groupby,
                       rename=labels, **test_kwargs)

    # create the output dir if it does not exist
    output_dir = os.path.join('../../../derivatives', 'demographics')
    os.makedirs(output_dir, exist_ok=True)

    pval_name = 'pval' if include_pval else 'no_pval'
    paired_name = '_with_paired' if paired else ''
    table_path = os.path.join(output_dir, f'table1_all_{pval_name}{paired_name}.csv')

    # round-trip through CSV to flatten the TableOne output into a plain dataframe
    mytable.to_csv(table_path)
    mytable_df = pd.read_csv(table_path)

    # drop unwanted columns and the first row
    mytable_df = mytable_df.drop(columns=['Grouped by group_ses.1', 'Grouped by group_ses'])
    mytable_df = mytable_df.drop(index=0).reset_index(drop=True)

    # The re-read CSV numbers its columns positionally; this mapping relies on
    # the group columns appearing in the order HC_baseline, MM_1year,
    # MM_baseline[, MM_paired], followed by the p-value and test-name columns.
    new_names = {'Unnamed: 0': 'Items', 'Unnamed: 1': 'Levels',
                 'Grouped by group_ses.2': 'HC baseline',
                 'Grouped by group_ses.3': 'MCC one-year',
                 'Grouped by group_ses.4': 'MCC baseline'}
    group_cols = ['HC baseline', 'MCC baseline', 'MCC one-year']
    next_col = 5
    if paired:
        new_names[f'Grouped by group_ses.{next_col}'] = 'MCC paired'
        group_cols.append('MCC paired')
        next_col += 1

    stat_cols = []
    if include_pval:
        new_names[f'Grouped by group_ses.{next_col}'] = 'P-values'
        new_names[f'Grouped by group_ses.{next_col + 1}'] = 'H-test'
        stat_cols = ['P-values', 'H-test']

    mytable_df = mytable_df.rename(columns=new_names)

    # Replace NaN with 0 in the group columns. Assign back instead of calling
    # fillna(inplace=True) on a column selection, which is a no-op under
    # pandas Copy-on-Write.
    for col in group_cols:
        mytable_df[col] = mytable_df[col].fillna(0)

    # remove rows for simplicity
    # NOTE(review): these are positional row labels of the flattened table and
    # depend on which levels occur in the data - verify after any data change.
    mytable_df = mytable_df.drop([2,8,10,14])

    mytable_df = mytable_df[['Items', 'Levels'] + group_cols + stat_cols]

    mytable_df.to_csv(table_path, index=False)

    display(mytable_df)

In [None]:
def create_table(include_pval, paired):
    """Run the whole pipeline: find subjects, exclude, annotate and save Table 1.

    Parameters
    ----------
    include_pval : bool
        Passed through to ``save_table1``.
    paired : bool
        If true, the MM subjects returned by ``get_paired_MC_subs`` are added a
        second time as an extra ``'MM_paired'`` group before the table is built.
    """
    final_subs_all = rm_CUD_baseline(get_all_subs())
    df_subs = create_indiv_subs_df(final_subs_all)

    if paired:
        paired_subs_df = create_indiv_subs_df(get_paired_MC_subs(final_subs_all))
        # relabel so these rows form their own group in the table
        paired_subs_df['group_ses'] = 'MM_paired'
        df_subs = pd.concat([df_subs, paired_subs_df], axis=0).reset_index(drop=True)

    save_table1(df_subs, include_pval, paired)

In [None]:
# NOTE: p-values and the paired group cannot be combined: there is only one set
# of paired subjects, so there is nothing to compare it against.
include_pval = False
paired = True

create_table(include_pval=include_pval, paired=paired)

