In [None]:
import os
import re
import glob
from collections import defaultdict

import pandas as pd
import numpy as np


In [None]:
def rm_CUD_baseline(subs_list):
    """Filter out the MM subjects that had cannabis use disorder (CUD) at baseline.

    Parameters
    ----------
    subs_list : iterable of str
        Subject IDs such as 'MM_014'.

    Returns
    -------
    list of str
        A new list with the IDs of `subs_list`, in their original order,
        minus the excluded ones. `subs_list` itself is not modified.
    """
    # MM participants dropped because CUD at baseline is an exclusion criterion
    cud_at_baseline = ('MM_014', 'MM_188', 'MM_197', 'MM_217', 'MM_228', 'MM_239', 'MM_241')

    kept_subs = []
    for sub in subs_list:
        if sub in cud_at_baseline:
            continue
        kept_subs.append(sub)

    return kept_subs
    

In [None]:
def _list_subs(group, session):
    """Return the sorted subject IDs of `group` that have a `ses-<session>` folder.

    Folders are searched three levels above the working directory with the
    pattern `sub-<group>*/ses-<session>`. A folder `sub-MM014` yields the
    ID 'MM_014'.
    """
    subs = set()
    for path in glob.glob(f'../../../sub-{group}*/ses-{session}'):
        # The subject folder is the parent of the session folder. Taking it via
        # os.path keeps the parsing independent of the path separator and of
        # how deep the relative prefix is.
        sub_dir = os.path.basename(os.path.dirname(os.path.normpath(path)))
        subs.add(f'{group}_' + sub_dir.split(f'-{group}')[1])

    # sorted so the result does not depend on glob or set iteration order
    return sorted(subs)


def get_all_subs():
    """Collect the subject IDs of every group/timepoint used in the analysis.

    Returns
    -------
    dict of str -> list of str
        'HC_baseline' : HC subjects with a baseline session.
        'MM_baseline' : MM subjects with a baseline session, minus the subjects
                        removed by `rm_CUD_baseline`.
        'MM_paired'   : MM subjects with both a baseline and a 1year session,
                        minus the subjects removed by `rm_CUD_baseline`.
        Every list is sorted by subject ID.
    """
    #get subs for each group and timepoint
    HC_subs_baseline = _list_subs('HC', 'baseline')
    MM_subs_baseline_all = _list_subs('MM', 'baseline')
    MM_subs_1year_all = _list_subs('MM', '1year')

    #additionally specify which MM subs are paired and remove CUD baseline subs from all MM lists
    MM_subs_in_both = sorted(set(MM_subs_baseline_all) & set(MM_subs_1year_all))
    MM_subs_paired = rm_CUD_baseline(MM_subs_in_both)
    MM_subs_baseline = rm_CUD_baseline(MM_subs_baseline_all)

    #put all subs lists together as a dictionary
    subs_all_dict = {
        'HC_baseline': HC_subs_baseline,
        'MM_baseline': MM_subs_baseline,
        'MM_paired': MM_subs_paired,
    }

    return subs_all_dict
    

In [None]:
def create_indiv_subs_df(group, subs_list):
    """Build one dataframe of demographics and n-back measures for a list of subjects.

    Parameters
    ----------
    group : str
        Label of the form '<cohort>_<session>', e.g. 'HC_baseline' or 'MM_1year'.
        The part before the first underscore selects the n-back file, the part
        after the last underscore is stored in the 'session' column.
    subs_list : list of str
        Subject IDs; they become the 'subs' column and are the lookup keys
        for every other column.

    Returns
    -------
    pandas.DataFrame
        One row per entry of `subs_list` with the columns 'subs', 'session',
        'Sex', 'Age' and one column per n-back measure. Subjects that are
        missing from a source table get NaN in the corresponding columns.
    """
    label_parts = group.split('_')
    cohort = label_parts[0]
    ses = label_parts[-1]

    #skeleton dataframe: one row per subject, all tagged with the same session
    df_subs = pd.DataFrame({'subs': subs_list, 'session': [ses] * len(subs_list)})

    #demographics: take the per-subject `first` aggregate of each source column
    non_img_data = pd.read_csv(f"../../../sourcedata/non_imaging_data/MMJ-Processed_data-2022_05_27-13_58-6858bbe.csv",low_memory=False)
    by_subject = non_img_data.groupby("IDS.CHR.Subject")

    for orig_name, col_name in (('SBJ.CHR.Sex', 'Sex'), ("SBJ.INT.Age", 'Age')):
        lookup = by_subject[orig_name].agg("first").to_dict()
        df_subs[col_name] = df_subs['subs'].map(lookup)

    #n-back data: HC has its own file and is used unfiltered, every other
    #cohort shares one file that is restricted to a single timepoint
    if cohort == 'HC':
        nback_data = pd.read_csv(f"../../../sourcedata/non_imaging_data/nback_RT_ACC/nback_Accuracy_RTime_HC.csv",low_memory=False)
    else:
        nback_data = pd.read_csv(f"../../../sourcedata/non_imaging_data/nback_RT_ACC/nback_Accuracy_RTime.csv",low_memory=False)
        #any session other than 'baseline' is read from the '1year' rows
        timepoint = 'baseline' if ses == 'baseline' else '1year'
        nback_data = nback_data[nback_data['timepoint'] == timepoint]

    #every column except the two key columns is copied over, matched by subject ID
    measure_cols = [col for col in nback_data.columns if col not in ('subject', 'timepoint')]
    for col in measure_cols:
        lookup = dict(zip(nback_data['subject'], nback_data[col]))
        df_subs[col] = df_subs['subs'].map(lookup)

    return df_subs


In [None]:
def create_table():
    """Assemble, save and display the n-back modeling tables.

    Two comparisons are built from the subject lists of `get_all_subs`:

    * 'HC_MM_baseline_comparison' - the HC_baseline and MM_baseline groups.
    * 'MM_paired_comparison'      - the MM_paired subjects, once with their
                                    baseline data and once with their 1year data.

    Each comparison is written to
    '../../../derivatives/behavioral/nback_data_for_modeling_<comparison>.csv'
    (without the index), then its name is printed and the table displayed.
    Returns None.
    """
    subs_all_dict = get_all_subs()

    #comparison name -> list of per-group dataframes to stack
    indiv_dfs = defaultdict(list)

    for group, subs in subs_all_dict.items():
        if group != 'MM_paired':
            indiv_dfs['HC_MM_baseline_comparison'].append(create_indiv_subs_df(group, subs))
            continue

        #paired subjects contribute one dataframe per timepoint
        for subgroup in ('MM_baseline', 'MM_1year'):
            indiv_dfs['MM_paired_comparison'].append(create_indiv_subs_df(subgroup, subs))

    for comparison, frames in indiv_dfs.items():
        #stack the group dataframes and store the result as a csv for modeling in R
        df_combined = pd.concat(frames, axis=0).reset_index(drop=True)

        #session is sorted descending, which puts 'baseline' before '1year' within a subject
        df_sorted = df_combined.sort_values(by=['subs', 'session'], ascending=[True, False])
        df_sorted.to_csv(f'../../../derivatives/behavioral/nback_data_for_modeling_{comparison}.csv', index=False)

        print(comparison)
        display(df_sorted)

In [None]:
# Run the pipeline: build both comparison tables, write them to CSV and display them.
create_table()