In [None]:
import os
import re
import glob

import pandas as pd
import numpy as np


In [None]:
def rm_CUD_baseline(subs_list):
    """Drop the MM subjects who had cannabis use disorder at baseline.

    Parameters
    ----------
    subs_list : iterable of str
        Subject IDs such as 'MM_014'.

    Returns
    -------
    list of str
        The IDs from `subs_list`, in their original order, without the
        excluded subjects.
    """

    #MM subjects with cannabis use disorder at baseline (an exclusion criterion)
    cud_at_baseline = ('MM_014','MM_188','MM_197','MM_217','MM_228','MM_239','MM_241')

    kept_subs = []
    for subject in subs_list:
        if subject in cud_at_baseline:
            continue
        kept_subs.append(subject)

    return kept_subs
    

In [None]:
def get_paired_MM_subs():
    """Return the MM subjects that have both a baseline and a 1year session folder.

    Session folders are searched with the glob pattern
    '../../../sub-MM*/ses-<session>' relative to the current working directory.
    Subjects excluded by `rm_CUD_baseline` are removed.

    Returns
    -------
    list of str
        Subject IDs of the form 'MM_<id>', sorted so the order is the same
        on every run (a bare set has no stable order between kernels).
    """

    def subs_with_session(ses):
        #the subject folder is the parent of the session folder; taking it with
        #os.path avoids depending on the path separator or the number of '../' parts
        ses_paths = glob.glob(f'../../../sub-MM*/ses-{ses}')
        sub_dirs = [os.path.basename(os.path.dirname(path)) for path in ses_paths]
        return {'MM_' + sub_dir.split('-MM')[1] for sub_dir in sub_dirs}

    #keep only subjects with both sessions and remove CUD baseline subs
    paired = subs_with_session('baseline') & subs_with_session('1year')
    MM_subs_paired = rm_CUD_baseline(sorted(paired))

    return MM_subs_paired
    

In [None]:
def _simplify_CUD_diagnosis(diagnosis):
    """Collapse a raw CUD diagnosis string into one of three labels.

    Non-string values (e.g. the NaN/None pandas returns for a missing
    diagnosis) are passed through unchanged, because the substring test below
    would raise a TypeError on them.
    """
    if not isinstance(diagnosis, str):
        return diagnosis
    if '(0)' in diagnosis:
        return 'no CUD diagnosis'
    if '(1)' in diagnosis:
        return 'mild CUD diagnosis'
    return 'moderate/severe CUD diagnosis'


def create_indiv_subs_df(ses, paired_MM_subs, non_img_data=None):
    """Build a one-row-per-subject dataframe of covariates and cannabis metrics.

    Parameters
    ----------
    ses : str
        Session label stored in the 'session' column. 'baseline' selects the
        'Baseline' rows of the non-imaging data; any other value selects the
        'One year' rows.
    paired_MM_subs : list of str
        Subject IDs matching the 'IDS.CHR.Subject' column.
    non_img_data : pandas.DataFrame, optional
        Already-loaded non-imaging data. When omitted, the project CSV is read
        from disk.

    Returns
    -------
    pandas.DataFrame
        Columns: 'subs', 'session', 'Sex', 'Age', 'CUD diagnosis',
        'CUDIT summed score', 'THC frequency per month', 'Positive urine THC'.
        Values that are missing in the non-imaging data are left missing.
    """

    #dataframe to add columns to for each subject
    #of note, MM_254 and MM_301 don't have urine THC results for the 1year timepoint
    df_subs = pd.DataFrame({
        'subs': paired_MM_subs,
        'session': [ses] * len(paired_MM_subs),
    })

    #load the non-imaging data unless the caller already has it in memory
    if non_img_data is None:
        non_img_data = pd.read_csv("../../../sourcedata/non_imaging_data/MMJ-Processed_data-2022_05_27-13_58-6858bbe.csv",low_memory=False)

    #covariates: first non-missing value per subject across all time points
    simple_additions = [('SBJ.CHR.Sex','Sex'),("SBJ.INT.Age",'Age')]
    by_subject = non_img_data.groupby("IDS.CHR.Subject")

    for orig_name, col_name in simple_additions:
        dict_map = by_subject[orig_name].agg("first").to_dict()
        df_subs[col_name] = df_subs['subs'].map(dict_map)

    #cannabis metrics: first non-missing value per subject within the requested time point
    by_ses_additions = [('CUD.CHR.Diagnosis','CUD diagnosis'),('INV.INT.CUDIT.Summed_score','CUDIT summed score'),('TLF.CHR.THC.Frequency_in_month','THC frequency per month'),('URN.LGC.THC_present','Positive urine THC')]

    #the time-point filter does not depend on the metric, so it is done once
    time_point = 'Baseline' if ses == 'baseline' else 'One year'
    ses_rows = non_img_data[non_img_data['SSS.CHR.Time_point'] == time_point]
    by_subject_ses = ses_rows.groupby('IDS.CHR.Subject')

    for orig_name, col_name in by_ses_additions:
        dict_map = by_subject_ses[orig_name].agg("first").to_dict()

        #simplify the CUD diagnosis
        if orig_name == 'CUD.CHR.Diagnosis':
            dict_map = {sub: _simplify_CUD_diagnosis(diagnosis) for sub, diagnosis in dict_map.items()}

        #subjects without a row at this time point are absent from dict_map and map to NaN
        df_subs[col_name] = df_subs['subs'].map(dict_map)

    return df_subs


In [None]:
def create_summary_table(ses, df):
    """Summarise one session's cannabis metrics as a table of formatted strings.

    Parameters
    ----------
    ses : str
        Session label; used as the name of the column holding the results.
    df : pandas.DataFrame
        Per-subject data with the columns 'CUD diagnosis', 'CUDIT summed score',
        'THC frequency per month' and 'Positive urine THC'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Items', 'Levels' and `ses`. Numerical items get one row with
        'median (q1-q3)' and a missing level. Categorical items get one row per
        level with 'count (percentage)', where the percentage is taken over the
        non-missing values of that column.

    Notes
    -----
    The row layout does not depend on how frequent each level is, so tables of
    different sessions line up row by row as long as the same levels occur.
    """

    #explicitly specify which columns are numerical; the rest are categorical
    numerical_columns = ['CUDIT summed score']

    #levels to report for each categorical column, in display order; listed levels
    #that do not occur are reported as 0, None means "use the observed levels"
    #NOTE(review): 'moderate/severe CUD diagnosis' is not listed, so such subjects
    #get no row here while still counting in the percentage denominator - confirm
    #this is intended
    categorical_levels = {
        'CUD diagnosis': ['no CUD diagnosis','mild CUD diagnosis'],
        'THC frequency per month': ['Less than once a month','Less than once every two weeks','Less than once a week',
                                    '1-2 days a week','3-4 days a week','5-6 days a week','Once or more per day'],
        'Positive urine THC': None,
    }

    #order of the items in the table; CUDIT summed score sits right after CUD diagnosis
    table_order = ['CUD diagnosis', 'CUDIT summed score', 'THC frequency per month', 'Positive urine THC']

    #list to store the formatted results
    summary_list = []

    for column in table_order:

        #summary statistics for numerical columns
        if column in numerical_columns:
            median_value = df[column].median()
            q1 = df[column].quantile(0.25)
            q3 = df[column].quantile(0.75)
            result = f'{median_value:.1f} ({q1:.1f}-{q3:.1f})'
            summary_list.append({'Items': f'{column}, median (IQR)', 'Levels': np.nan, f'{ses}': result})
            continue

        #summary statistics for categorical columns
        counts = df[column].value_counts()
        proportions = df[column].value_counts(normalize=True)

        level_order = categorical_levels[column]
        if level_order is None:
            #value_counts orders levels by frequency, which can differ between
            #sessions; order them by their label instead
            level_order = sorted(counts.index, key=str)

        counts = counts.reindex(level_order, fill_value=0)
        proportions = proportions.reindex(level_order, fill_value=0)

        #counts and proportions share the same index after the reindex above
        for category, count, proportion in zip(counts.index, counts, proportions):
            result = f'{count} ({proportion * 100:.1f})'
            summary_list.append({'Items': f'{column}, n (%)', 'Levels': category, f'{ses}': result})

    #create a DataFrame from the summary list
    summary_df = pd.DataFrame(summary_list)

    return summary_df

In [None]:
def create_table():
    """Build and save the cannabis metrics summary table and the modeling csv.

    For the paired MM subjects, the per-subject metrics of the baseline and
    1year sessions are collected, summarised per session, combined into one
    summary table (displayed and written to cannabis_metrics_table.csv), and
    stacked into one long dataframe (written to cannabis_data_for_modeling.csv).

    Raises
    ------
    ValueError
        If the per-session summary tables do not describe the same set of
        (Items, Levels) rows, since they could then not be combined without
        mislabelling values.
    """

    summary_csv = '../../../derivatives/demographics/cannabis_metrics_table.csv'
    modeling_csv = '../../../derivatives/demographics/cannabis_data_for_modeling.csv'

    def row_keys(table):
        #NaN never equals NaN, so a missing level is keyed as None to keep rows comparable
        return [(item, None if pd.isna(level) else level)
                for item, level in zip(table.iloc[:, 0], table.iloc[:, 1])]

    #create df of all subs with relevant cannabis metrics and prepare csv to run modeling in R
    paired_MM_subs = get_paired_MM_subs()

    summary_dfs = []
    indiv_dfs = []

    #get cannabis metrics for the two sessions for the paired MMC participants
    for ses in ['baseline', '1year']:
        indiv_subs_df = create_indiv_subs_df(ses, paired_MM_subs)

        #create summary tables
        summary_dfs.append(create_summary_table(ses, indiv_subs_df))

        #keep individual df for csv
        indiv_dfs.append(indiv_subs_df)

    #make shared summary statistics dataframe: the first session provides the row
    #labels, every session contributes its results column matched on those labels
    row_labels = summary_dfs[0].iloc[:, :2].copy()
    reference_keys = row_keys(row_labels)
    summary_parts = [row_labels]

    for indiv_summary_df in summary_dfs:
        keys = row_keys(indiv_summary_df)

        if len(keys) != len(reference_keys) or set(keys) != set(reference_keys):
            raise ValueError(
                f"summary table '{indiv_summary_df.columns[2]}' has different rows than "
                f"'{summary_dfs[0].columns[2]}'; cannot combine the sessions"
            )

        #reorder this session's rows to the order of the first session
        positions = [keys.index(key) for key in reference_keys]
        summary_parts.append(indiv_summary_df.iloc[positions, 2].reset_index(drop=True))

    summary_df = pd.concat(summary_parts, axis=1)

    #store the summary as a csv, creating the output folder when it is missing
    display(summary_df)
    os.makedirs(os.path.dirname(summary_csv), exist_ok=True)
    summary_df.to_csv(summary_csv,index=False)

    #make a combined df and store as a csv for modeling in R
    df_combined = pd.concat(indiv_dfs, axis=0, ignore_index=True)
    #descending session order puts 'baseline' before '1year' within each subject
    df_sorted = df_combined.sort_values(by=['subs','session'],ascending=[True, False])
    os.makedirs(os.path.dirname(modeling_csv), exist_ok=True)
    df_sorted.to_csv(modeling_csv,index=False)

    return

In [None]:
#run the pipeline: displays the summary table and writes both csv files
create_table()