In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from tableone import TableOne
import glob
import re
import os

In [None]:
def rm_CUD_baseline(subs_all):
    """Remove all sessions of subjects that had cannabis use disorder at baseline.

    Parameters
    ----------
    subs_all : iterable of (subject, session) pairs
        Subject IDs are compared against the hard-coded exclusion list below.

    Returns
    -------
    list of [subject, session] lists
        The input pairs, in their original order, minus every pair whose
        subject is on the exclusion list.
    """
    # MM subjects meeting the exclusion criterion (cannabis use disorder at baseline)
    cud_at_baseline = ('MM_014', 'MM_188', 'MM_197', 'MM_217', 'MM_228', 'MM_239', 'MM_241')

    kept = []
    for subject, session in subs_all:
        if subject in cud_at_baseline:
            continue
        kept.append([subject, session])

    return kept
    

In [None]:
def _sub_ses_from_path(path):
    """Turn a '<...>/sub-<LABEL>/ses-<SESSION>' path into [subject, session].

    The subject label is split at its 'MM' or 'HC' group prefix and re-joined
    with an underscore (e.g. 'MM014' -> 'MM_014').
    """
    # Use os.path to pick the last two components instead of splitting on '/'
    # and indexing by position, which breaks when glob returns OS-specific
    # separators or when the search root has a different depth.
    normalized = os.path.normpath(path)
    ses_dir = os.path.basename(normalized)
    sub_dir = os.path.basename(os.path.dirname(normalized))

    sub_label = sub_dir.split('-')[1]
    # re.split with a capturing group keeps the prefix: ['', 'MM', '014'] -> 'MM_014'
    sub = '_'.join(re.split(r'(MM|HC)', sub_label)[1:])
    ses = ses_dir.split('-')[1]
    return [sub, ses]


def get_all_subs():
    """Collect [subject, session] pairs for all baseline and one-year sessions.

    Globs '../../../sub-*/ses-baseline' and '../../../sub-*/ses-1year'
    relative to the current working directory.

    Returns
    -------
    list of [subject, session] lists
        All baseline entries first, followed by all one-year entries; within
        each session the order is whatever glob returns.
    """
    subs_all = []
    for ses in ('baseline', '1year'):
        ses_paths = glob.glob(f'../../../sub-*/ses-{ses}')
        subs_all.extend(_sub_ses_from_path(path) for path in ses_paths)

    return subs_all

In [None]:
def get_paired_MC_subs(subs_all):
    """Keep only MM subjects that appear with exactly two sessions.

    Parameters
    ----------
    subs_all : iterable of (subject, session) pairs

    Returns
    -------
    list of [subject, session] lists
        For every subject whose ID contains 'MM' and that occurs exactly twice
        in ``subs_all``: a 'baseline' entry followed by a '1year' entry.
        Subjects are emitted in order of first appearance.
    """
    sessions_by_sub = {}
    for subject, session in subs_all:
        if 'MM' not in subject:
            continue
        sessions_by_sub.setdefault(subject, []).append(session)

    # Only the number of recorded sessions is checked, not their names.
    return [
        [subject, session]
        for subject, recorded in sessions_by_sub.items()
        if len(recorded) == 2
        for session in ('baseline', '1year')
    ]

In [None]:
def create_indiv_subs_df(subs_input):
    """Build a per-session dataframe with cannabis-related measures attached.

    Parameters
    ----------
    subs_input : iterable of (subject, session) pairs
        Sessions are expected to be 'baseline' or '1year'. The input is not
        modified.

    Returns
    -------
    pandas.DataFrame
        One row per (subject, session), sorted by subject then session, with
        columns 'subs', 'sub_ses' ('<subject>_<session>') and 'ses', plus one
        column per measure: 'CUD diagnosis', 'CUDIT summed score',
        'THC frequency per month' and 'Positive urine THC'. Rows without a
        matching MM subject/time point in the non-imaging data get NaN.
    """
    # sorted() rather than list.sort() so the caller's list is left untouched
    subs = [[item[0], item[0] + '_' + item[1], item[1]] for item in sorted(subs_input)]
    df_subs = pd.DataFrame.from_records(subs, columns=['subs', 'sub_ses', 'ses'])

    # load the non-imaging data
    non_img_data = pd.read_csv("../../../sourcedata/non_imaging_data/MMJ-Processed_data-2022_05_27-13_58-6858bbe.csv", low_memory=False)

    # (column in the non-imaging data, column name in the output dataframe)
    by_ses_additions = [
        ('CUD.CHR.Diagnosis', 'CUD diagnosis'),
        ('INV.INT.CUDIT.Summed_score', 'CUDIT summed score'),
        ('TLF.CHR.THC.Frequency_in_month', 'THC frequency per month'),
        ('URN.LGC.THC_present', 'Positive urine THC'),
    ]

    # (time-point label in the non-imaging data, session suffix used in 'sub_ses')
    time_points = [('Baseline', 'baseline'), ('One year', '1year')]

    # The time-point filter and per-subject grouping do not depend on the
    # measure, so build them once instead of once per measure.
    grouped_by_ses = {
        ses: non_img_data[non_img_data['SSS.CHR.Time_point'] == label].groupby('IDS.CHR.Subject')
        for label, ses in time_points
    }

    for orig_name, col_name in by_ses_additions:
        dict_MM = {}
        for ses, grouped in grouped_by_ses.items():
            # "first" takes the first non-null value recorded for each subject
            first_values = grouped[orig_name].agg("first").to_dict()
            dict_MM.update({f'{sub}_{ses}': val for sub, val in first_values.items() if 'MM' in sub})

        df_subs[col_name] = df_subs['sub_ses'].map(dict_MM)

    return df_subs


In [None]:
def save_table1(df_subs,include_pval):
    """Build the cannabis-use Table 1 grouped by session, save it as CSV and display it.

    Parameters
    ----------
    df_subs : pandas.DataFrame
        Must contain the column 'ses' and the four measure columns
        'CUD diagnosis', 'CUDIT summed score', 'THC frequency per month'
        and 'Positive urine THC'.
    include_pval : bool
        If true, p-values and the name of the hypothesis test are requested
        from TableOne and kept as 'P-values' / 'H-test' columns; the result is
        written to 'cannabis_table1_all_pval.csv'. Otherwise the result is
        written to 'cannabis_table1_no_pval.csv'.

    Returns
    -------
    None
        The table is written to '../../../derivatives/demographics/' and
        shown with ``display``.

    Notes
    -----
    The clean-up step addresses rows by hard-coded position (2, 12, the
    reorder list, and rows 4/7 for the p-value). These presumably match the
    TableOne layout for exactly these four variables and their levels in this
    dataset -- re-check them if the variables or their levels change.
    """
    columns = ['CUD diagnosis','CUDIT summed score','THC frequency per month','Positive urine THC']
    categorical = ['CUD diagnosis','THC frequency per month','Positive urine THC']

    groupby = ['ses']
    labels = {'MM_baseline': 'MC baseline','MM_1year': 'MC one-year'}

    # Same TableOne call for both modes; only the p-value related options differ.
    pval_options = {'pval': True, 'htest_name': True} if include_pval else {'pval': False}
    mytable = TableOne(df_subs, columns=columns, categorical=categorical, groupby=groupby, rename=labels, **pval_options)

    # create path to output dir if it does not exist
    derivatives_path = '../../../derivatives'
    output_path = os.path.join(derivatives_path, 'demographics')
    os.makedirs(output_path, exist_ok=True)

    if include_pval:
        csv_path = '../../../derivatives/demographics/cannabis_table1_all_pval.csv'
    else:
        csv_path = '../../../derivatives/demographics/cannabis_table1_no_pval.csv'

    # Round-trip through CSV to get a flat dataframe whose columns are named
    # 'Unnamed: N' / 'Grouped by ses[.N]'.
    mytable.to_csv(csv_path)
    mytable_df = pd.read_csv(csv_path)

    # Columns to keep, in output order ('.3' before '.2' swaps 1 year and
    # baseline), mapped to more understandable names.
    kept_columns = {
        'Unnamed: 0': 'Items',
        'Unnamed: 1': 'Levels',
        'Grouped by ses.3': 'MCC baseline',
        'Grouped by ses.2': 'MCC one-year',
    }
    if include_pval:
        kept_columns['Grouped by ses.4'] = 'P-values'
        kept_columns['Grouped by ses.5'] = 'H-test'

    mytable_df = (
        mytable_df
        .drop(columns=['Grouped by ses', 'Grouped by ses.1'])  # remove missing and overall
        .loc[:, list(kept_columns)]
        .rename(columns=kept_columns)
        .drop(index=0)
        .reset_index(drop=True)
    )

    # replace a few values with nan for simplicity
    mytable_df.at[2, 'Levels'] = np.nan
    mytable_df.at[12, 'Levels'] = np.nan

    # reorder to match desired output and remove some rows for simplicity
    custom_order_indices = [2,3,7,9,8,4,5,6,10,12]
    mytable_df = mytable_df.iloc[custom_order_indices].copy()

    if include_pval:
        # move p-value of reordered item to topmost level (labels 4 and 7 are
        # the pre-reorder row labels, which iloc preserved)
        pvalue_to_move = mytable_df.loc[4, 'P-values']
        mytable_df.loc[4, 'P-values'] = np.nan
        mytable_df.loc[7, 'P-values'] = pvalue_to_move
        mytable_df = mytable_df.reset_index(drop=True)

    # Assign the result back: `df[col].fillna(0, inplace=True)` acts on a
    # temporary Series and is a no-op under pandas Copy-on-Write.
    mytable_df = mytable_df.fillna({'MCC baseline': 0, 'MCC one-year': 0})

    mytable_df.to_csv(csv_path, index=False)
    display(mytable_df)

    return

In [None]:
def create_table(include_pval):
    """Run the whole pipeline: find sessions, filter subjects, build and save Table 1.

    Parameters
    ----------
    include_pval : bool
        Forwarded to ``save_table1``.

    Returns
    -------
    None
    """
    # all sessions -> drop baseline-CUD subjects -> keep MM subjects with two sessions
    paired_mc_subs = get_paired_MC_subs(rm_CUD_baseline(get_all_subs()))

    # attach the per-session measures, then write and display the table
    save_table1(create_indiv_subs_df(paired_mc_subs), include_pval)
    return None

In [None]:
# True additionally requests p-values and the hypothesis-test name for the table.
include_pval = False
create_table(include_pval)