In [20]:
import os
import pandas as pd
import numpy as np
import re
import glob
import json
import matplotlib.pyplot as plt
import statsmodels
from scipy import stats


In [21]:
#prepare data from MMJ-Processed_data-2022_05_27-13_58-6858bbe.csv


def create_subs_df(group,ses,task,contrast):
    effect_size_maps = glob.glob(f'../../../derivatives/task_analysis_volume/first_level/sub-{group}*/ses-{ses}/task-{task}/sub-{group}*_ses-{ses}_task-{task}_rec-unco_run-1_contrast-{contrast}_effect_size.nii.gz')
    non_img_data = pd.read_csv(f"../../../sourcedata/non_imaging_data/MMJ-Processed_data-2022_05_27-13_58-6858bbe.csv",low_memory=False)

    #need to find subjects that have data for this contrast
    subs = [path.split('/sub-')[1].split('/')[0] for path in effect_size_maps if path]
    subs = ['_'.join([s for s in re.split(r'(MM|HC)', sub) if s]) for sub in subs]
    df_subs = pd.DataFrame(subs,columns=['subs'])


    #add columns for male and female, that will then be combined to create the group average 
    grouped_sex = non_img_data.groupby("IDS.CHR.Subject")["SBJ.CHR.Sex"].agg("first")
    dict_sex = grouped_sex.to_dict()
    df_subs = pd.concat([df_subs,pd.get_dummies(df_subs['subs'].map(dict_sex))],axis=1,copy=False)

    #add age, mean-centered
    grouped_age = non_img_data.groupby("IDS.CHR.Subject")["SBJ.INT.Age"].agg("first")
    dict_age = grouped_age.to_dict()
    df_subs['age'] = df_subs['subs'].map(dict_age)
    df_subs['age'] = df_subs['age'] - df_subs['age'].mean()

    #add CUDIT summed score, mean-centered
    if group == 'HC': 
        grouped_HC_baseline_cudit = non_img_data[non_img_data['SSS.CHR.Time_point'] == 'Screening'].groupby('IDS.CHR.Subject')['INV.INT.CUDIT.Summed_score'].agg("first")
        dict_HC_baseline_cudit = grouped_HC_baseline_cudit.to_dict()
        df_subs['total_cudit'] = df_subs['subs'].map(dict_HC_baseline_cudit)
        df_subs['total_cudit'] = df_subs['total_cudit'] - df_subs['total_cudit'].mean()

    else:
        if ses == 'baseline':
            dict_MM_baseline_cudit = non_img_data[non_img_data['SSS.CHR.Time_point'] == 'Baseline'].groupby('IDS.CHR.Subject')['INV.INT.CUDIT.Summed_score'].agg("first").to_dict()
            df_subs['total_cudit'] = df_subs['subs'].map(dict_MM_baseline_cudit)
            df_subs['total_cudit'] = df_subs['total_cudit'] - df_subs['total_cudit'].mean()
            df_subs['total_cudit'].fillna(0, inplace=True)

        else:
            dict_MM_1year_cudit = non_img_data[non_img_data['SSS.CHR.Time_point'] == 'One year'].groupby('IDS.CHR.Subject')['INV.INT.CUDIT.Summed_score'].agg("first").to_dict()
            df_subs['total_cudit'] = df_subs['subs'].map(dict_MM_1year_cudit)
            df_subs['total_cudit'] = df_subs['total_cudit'] - df_subs['total_cudit'].mean()

    #add frequency of THC use per month, mean-centered
    freq_dict = {'Once or more per day':7,
     '5-6 days a week':6,
     '3-4 days a week':5,
     '1-2 days a week':4,
     'Less than once a week':3,
     'Less than once every two weeks':2,
     'Less than once a month':1,
     None:0,
     }

    if group == 'HC':
        #results from screening visit (using this for consistency since CUDIT-R was also collected at screening visit)
        dict_HC_screening_THC = non_img_data[non_img_data['SSS.CHR.Time_point'] == 'Screening'].groupby('IDS.CHR.Subject')['TLF.CHR.THC.Frequency_in_month'].agg("last").to_dict()
        dict_HC_screening_THC_num = {k:freq_dict[v] for k,v in dict_HC_screening_THC.items()}
        df_subs['THC_freq_month'] = df_subs['subs'].map(dict_HC_screening_THC_num)
        df_subs['THC_freq_month'] = df_subs['THC_freq_month'] - df_subs['THC_freq_month'].mean()

    else:
        if ses == 'baseline':     
            #results from MRI visit (using this for consistency since CUDIT-R was also collected at MRI visit)
            dict_MM_MRIvisit_THC = non_img_data[non_img_data['SSS.CHR.Time_point'] == 'Baseline'].groupby('IDS.CHR.Subject')['TLF.CHR.THC.Frequency_in_month'].agg("first").to_dict()
            dict_MM_MRIvisit_THC_num = {k:freq_dict[v] for k,v in dict_MM_MRIvisit_THC.items()}
            df_subs['THC_freq_month'] = df_subs['subs'].map(dict_MM_MRIvisit_THC_num)
            df_subs['THC_freq_month'] = df_subs['THC_freq_month'] - df_subs['THC_freq_month'].mean()

        else:
            dict_MM_MRIvisit_THC = non_img_data[non_img_data['SSS.CHR.Time_point'] == 'One year'].groupby('IDS.CHR.Subject')['TLF.CHR.THC.Frequency_in_month'].agg("first").to_dict()
            dict_MM_MRIvisit_THC_num = {k:freq_dict[v] for k,v in dict_MM_MRIvisit_THC.items()}
            df_subs['THC_freq_month'] = df_subs['subs'].map(dict_MM_MRIvisit_THC_num)
            df_subs['THC_freq_month'] = df_subs['THC_freq_month'] - df_subs['THC_freq_month'].mean()
    
    return df_subs



In [22]:
#prepare matching data from nback_Accuracy_RTime and nback_Accuracy_RTime_HC

def create_merged_df(df_subs,group,ses):
    if group == 'HC':
        nback_data = pd.read_csv(f"../../../sourcedata/non_imaging_data/nback_RT_ACC/nback_Accuracy_RTime_HC.csv",low_memory=False)
        nback_data=nback_data.rename(columns = {'subject':'subs'})
    else:
        nback_data = pd.read_csv(f"../../../sourcedata/non_imaging_data/nback_RT_ACC/nback_Accuracy_RTime.csv",low_memory=False)
        nback_data=nback_data.rename(columns = {'subject':'subs'})
        if ses == 'baseline':
            nback_data=nback_data[nback_data['timepoint']=='baseline']
        else:
            nback_data=nback_data[nback_data['timepoint']=='1year']

    nback_data.drop(columns=['timepoint'], inplace = True)

    for column in nback_data.columns[1:]:
        nback_data[column] = nback_data[column] - nback_data[column].mean()
        nback_data[column].fillna(0, inplace=True)

    merged_data = pd.merge(df_subs, nback_data, on='subs')
    
    return merged_data
    

In [23]:
def create_df(group,ses,task,contrast):
    df_subs = create_subs_df(group,ses,task,contrast)
    merged_data = create_merged_df(df_subs,group,ses)
    return merged_data

In [24]:
def calc_paired_diff(group1_data,group2_data,column):
    
    subs1 = group1_data['subs']
    subs2 = group2_data['subs']
    
    subs = set(subs1).intersection(set(subs2))
    
    xdiff = [float(group2_data[group2_data['subs'] == sub][column])-float(group1_data[group1_data['subs'] == sub][column]) for sub in subs]
    
    return xdiff

In [34]:
#perform t-test 

def t_test_between_groups(group1_data,group2_data):
    dict_results={}
    
    for column in group1_data.columns[6:]:
        
        x1 = group1_data[column]
        x2 = group2_data[column]
        pop_mean = np.mean(x2)-np.mean(x1)
        
        tstat, pval = stats.ttest_ind(x1, x2, axis=0, equal_var=True, nan_policy='raise', permutations=None, alternative='two-sided')
        
        rounded_pop_mean = round(pop_mean,3)
        rounded_tstat = round(tstat,3)
        rounded_pval = round(pval,3)
        
        dict_results[column] = {'difference of means':rounded_pop_mean, 't statistic':rounded_tstat, 'p value':rounded_pval}
    
    return dict_results

In [35]:
#perform t-test 

def paired_t_test(group1_data,group2_data):
    dict_results={}
    
    for column in group1_data.columns[6:]:
        
        xdiff = calc_paired_diff(group1_data,group2_data,column)
        pop_mean = np.mean(xdiff)
        
        tstat, pval = stats.ttest_1samp(xdiff, popmean=0, nan_policy='raise', alternative='two-sided')

        rounded_pop_mean = round(pop_mean,3)
        rounded_tstat = round(tstat,3)
        rounded_pval = round(pval,3)
        
        dict_results[column] = {'mean difference':rounded_pop_mean, 't statistic':rounded_tstat, 'p value':rounded_pval}
        
    return dict_results

In [30]:
def save_dfs(group1_data,group1,ses1,group2_data,group2,ses2):
    
    #create paths to output dir if not exist
    derivatives_path = '../../../derivatives'
    nilearn_output_path = os.path.join(derivatives_path, 'behavioral', 'task-nback')
    if not os.path.isdir(nilearn_output_path):
        os.makedirs (nilearn_output_path)
        
    group1_data.to_csv(f'../../../derivatives/behavioral/task-nback/nback_group-{group1}_ses-{ses1}.csv')
    group2_data.to_csv(f'../../../derivatives/behavioral/task-nback/nback_group-{group2}_ses-{ses2}.csv')
    
    return

In [36]:
inputs = [('MM','HC','baseline','baseline'),('MM','MM','baseline','1year')]


output_dfs = {}

for group1,group2,ses1,ses2 in inputs:
    
    group1_data = create_df(group1,ses1,'nback','twoback-zeroback')
    group2_data = create_df(group2,ses2,'nback','twoback-zeroback')
    
    save_dfs(group1_data,group1,ses1,group2_data,group2,ses2)

    ttest_2samp = t_test_between_groups(group1_data,group2_data)
    
    output_dfs[f'{group1}_{ses1}_vs._{group2}_{ses2}'] = pd.DataFrame.from_dict(ttest_2samp)
    
    if group2=='MM':
        ttest_paired = paired_t_test(group1_data,group2_data)
        output_dfs[f'{group1}_{ses1}_vs._{group2}_{ses2}_(paired)'] = pd.DataFrame.from_dict(ttest_paired)
        

for title in output_dfs.keys():
    print(title)
    display(output_dfs[title])  
    output_dfs[title].to_csv(f'../../../derivatives/behavioral/task-nback/nback_ttest_comparison-{title}.csv')

MM_baseline_vs._HC_baseline


Unnamed: 0,ACC_all,RT_all_cor,ACC_0b,RT_0b_cor,ACC_2b,RT_2b_cor
difference of means,-0.002,-0.683,0.002,0.432,-0.007,-1.512
t statistic,0.06,0.05,-0.041,-0.038,0.161,0.075
p value,0.952,0.96,0.968,0.97,0.872,0.94


MM_baseline_vs._MM_1year


Unnamed: 0,ACC_all,RT_all_cor,ACC_0b,RT_0b_cor,ACC_2b,RT_2b_cor
difference of means,-0.007,-2.01,-0.002,-0.625,-0.012,-3.104
t statistic,0.205,0.168,0.044,0.056,0.338,0.195
p value,0.838,0.867,0.965,0.955,0.736,0.846


MM_baseline_vs._MM_1year_(paired)


Unnamed: 0,ACC_all,RT_all_cor,ACC_0b,RT_0b_cor,ACC_2b,RT_2b_cor
mean difference,0.016,3.774,0.019,2.235,0.013,7.272
t statistic,0.448,0.303,0.415,0.168,0.428,0.52
p value,0.657,0.764,0.681,0.868,0.672,0.607
