In [None]:
import os
import pandas as pd
import numpy as np
import re
import glob
import json
import matplotlib.pyplot as plt
import statsmodels
from scipy import stats


In [None]:
#prepare data from MMJ-Processed_data-2022_05_27-13_58-6858bbe.csv


# Ordinal coding of the THC frequency answer categories (a larger code means
# more frequent use; a missing answer is coded as 0).
_THC_FREQ_CODES = {
    'Once or more per day': 7,
    '5-6 days a week': 6,
    '3-4 days a week': 5,
    '1-2 days a week': 4,
    'Less than once a week': 3,
    'Less than once every two weeks': 2,
    'Less than once a month': 1,
    None: 0,
}


def _per_subject_value(non_img_data, column, time_point=None, how="first"):
    """Return ``{subject id: value}`` for ``column``, optionally limited to one time point."""
    rows = non_img_data
    if time_point is not None:
        rows = rows[rows['SSS.CHR.Time_point'] == time_point]
    return rows.groupby("IDS.CHR.Subject")[column].agg(how).to_dict()


def _thc_code(label):
    """Translate one THC frequency label into its ordinal code; None/NaN map to 0."""
    # groupby aggregation may hand back None or NaN for a subject without an
    # answer; NaN is not the ``None`` dictionary key, so it is handled here.
    if pd.isna(label):
        return _THC_FREQ_CODES[None]
    # Unknown labels still raise KeyError so that new categories are noticed.
    return _THC_FREQ_CODES[label]


def _mean_centered(values):
    """Subtract the (NaN-skipping) mean of ``values`` from every entry."""
    return values - values.mean()


def create_subs_df(group,ses,task,contrast):
    """Build the per-subject covariate table for one group and session.

    Subjects are taken from the first-level effect-size maps found on disk for
    the requested contrast; covariates are looked up in the non-imaging CSV.

    Parameters
    ----------
    group : str
        Subject prefix used in the file pattern. ``'HC'`` selects the
        'Screening' time point; any other value is handled like ``'MM'``.
    ses : str
        Session label used in the file pattern. For non-HC groups
        ``'baseline'`` selects the 'Baseline' time point and anything else
        selects 'One year'.
    task, contrast : str
        Only used to build the glob pattern of the effect-size maps.

    Returns
    -------
    pandas.DataFrame
        Columns: ``subs``, one dummy column per sex value present, ``age``,
        ``total_cudit`` and ``THC_freq_month`` (the last three mean-centered
        over the subjects found on disk).

    Raises
    ------
    KeyError
        If a THC frequency label is not one of the known categories.

    Notes
    -----
    * Missing CUDIT scores are replaced by 0 (i.e. the mean after centering)
      only for the non-HC baseline session; other branches keep NaN.
      NOTE(review): confirm this asymmetry is intended.
    * The HC THC frequency uses the *last* screening entry per subject while
      the other branches use the *first*. NOTE(review): confirm.
    * Centering happens here, before ``create_df`` removes excluded subjects,
      so the remaining rows need not have exactly zero mean.
    """
    effect_size_maps = glob.glob(f'../../../derivatives/task_analysis_volume/first_level/sub-{group}*/ses-{ses}/task-{task}/sub-{group}*_ses-{ses}_task-{task}_rec-unco_run-1_contrast-{contrast}_effect_size.nii.gz')
    non_img_data = pd.read_csv("../../../sourcedata/non_imaging_data/MMJ-Processed_data-2022_05_27-13_58-6858bbe.csv",low_memory=False)

    # Subjects that have data for this contrast: take the directory name after
    # '/sub-' and insert an underscore after the group prefix (MM014 -> MM_014)
    # so the ids match the non-imaging table.
    subs = [path.split('/sub-')[1].split('/')[0] for path in effect_size_maps if path]
    subs = ['_'.join([s for s in re.split(r'(MM|HC)', sub) if s]) for sub in subs]
    df_subs = pd.DataFrame(subs,columns=['subs'])

    # One dummy column per sex value.
    sex_by_subject = _per_subject_value(non_img_data, "SBJ.CHR.Sex")
    df_subs = pd.concat([df_subs, pd.get_dummies(df_subs['subs'].map(sex_by_subject))], axis=1)

    # Age, mean-centered.
    age_by_subject = _per_subject_value(non_img_data, "SBJ.INT.Age")
    df_subs['age'] = _mean_centered(df_subs['subs'].map(age_by_subject))

    # Time point whose questionnaire answers belong to this group/session.
    if group == 'HC':
        time_point = 'Screening'
    elif ses == 'baseline':
        time_point = 'Baseline'
    else:
        time_point = 'One year'

    # CUDIT summed score, mean-centered.
    cudit_by_subject = _per_subject_value(non_img_data, 'INV.INT.CUDIT.Summed_score', time_point)
    total_cudit = _mean_centered(df_subs['subs'].map(cudit_by_subject))
    if group != 'HC' and ses == 'baseline':
        # Assign the filled result back: a chained ``fillna(inplace=True)`` on
        # a column does not reliably modify the frame.
        total_cudit = total_cudit.fillna(0)
    df_subs['total_cudit'] = total_cudit

    # THC use frequency as an ordinal code, mean-centered.
    thc_how = "last" if group == 'HC' else "first"
    thc_label_by_subject = _per_subject_value(non_img_data, 'TLF.CHR.THC.Frequency_in_month', time_point, how=thc_how)
    thc_code_by_subject = {subject: _thc_code(label) for subject, label in thc_label_by_subject.items()}
    df_subs['THC_freq_month'] = _mean_centered(df_subs['subs'].map(thc_code_by_subject))

    return df_subs


In [None]:
#prepare matching data from nback_Accuracy_RTime and nback_Accuracy_RTime_HC

def create_merged_df(df_subs,group,ses):
    """Attach mean-centered n-back accuracy / reaction-time columns to ``df_subs``.

    Parameters
    ----------
    df_subs : pandas.DataFrame
        Subject table with a ``subs`` column (as built by ``create_subs_df``).
    group : str
        ``'HC'`` reads the HC file and applies no session filter; any other
        value reads the other file and filters on ``timepoint``.
    ses : str
        For non-HC groups ``'baseline'`` keeps rows with timepoint 'baseline',
        anything else keeps rows with timepoint '1year'.

    Returns
    -------
    pandas.DataFrame
        Inner merge of ``df_subs`` and the n-back table on ``subs``.

    Notes
    -----
    Every column after the first one is centered on its mean over *all* rows
    of the (filtered) n-back table, and missing values are then set to 0.
    The centering therefore happens before the merge and separately for each
    group/session.
    """
    if group == 'HC':
        nback_data = pd.read_csv("../../../sourcedata/non_imaging_data/nback_RT_ACC/nback_Accuracy_RTime_HC.csv",low_memory=False)
    else:
        nback_data = pd.read_csv("../../../sourcedata/non_imaging_data/nback_RT_ACC/nback_Accuracy_RTime.csv",low_memory=False)

    nback_data = nback_data.rename(columns={'subject': 'subs'})

    if group != 'HC':
        wanted_timepoint = 'baseline' if ses == 'baseline' else '1year'
        nback_data = nback_data[nback_data['timepoint'] == wanted_timepoint]

    # Non-inplace drop yields a fresh frame, so the column assignments below
    # do not write into a filtered slice of the original table.
    nback_data = nback_data.drop(columns=['timepoint'])

    # The first column is expected to be the subject id; everything after it
    # is treated as a measurement.
    for column in nback_data.columns[1:]:
        centered = nback_data[column] - nback_data[column].mean()
        # Assign the filled result back: a chained ``fillna(inplace=True)`` on
        # a column does not reliably modify the frame.
        nback_data[column] = centered.fillna(0)

    merged_data = pd.merge(df_subs, nback_data, on='subs')

    return merged_data
    

In [None]:
def plot_paired_difference_distribution(MM_baseline_df,MM_1year_df,column):
    """Plot a histogram of the paired (1year - baseline) differences of ``column``.

    The differences come from ``calc_paired_diff`` and therefore cover only
    subjects present in both frames. A dashed vertical line marks their mean.

    Parameters
    ----------
    MM_baseline_df, MM_1year_df : pandas.DataFrame
        Frames with a ``subs`` column and the requested ``column``.
    column : str
        Name of the column whose paired differences are plotted.
    """
    paired_differences = calc_paired_diff(MM_baseline_df,MM_1year_df,column)

    # Histogram of the differences.
    plt.hist(paired_differences, bins='auto', edgecolor='black', label='Paired differences')

    # Dashed vertical line at the mean difference.
    plt.axvline(np.mean(paired_differences), color='blue', linestyle='dashed', linewidth=2, label='Paired differences mean')

    # The plotted quantity is a difference, so say so on the axis and title.
    plt.xlabel(f'{column} paired differences (1year - baseline)')
    plt.ylabel('Frequency')
    plt.title(f'Distribution of paired differences in {column}')

    # Without a legend the labels passed above are never rendered.
    plt.legend()

    plt.show()
    

In [None]:
def plot_two_distributions(df1,df2,column,label1='Group 1',label2='Group 2'):
    """Overlay the histograms of ``column`` from two frames and mark both means.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that both contain ``column``.
    column : str
        Name of the column to plot.
    label1, label2 : str, optional
        Legend names for ``df1`` and ``df2``. Formatting the frames themselves
        into the label would put the whole DataFrame repr into the legend, so
        short names are passed in instead.
    """
    plt.hist(df1[column], bins='auto', alpha=0.5, label=label1, edgecolor='black')
    plt.hist(df2[column], bins='auto', alpha=0.5, label=label2, edgecolor='black')

    # Dashed vertical lines at the two means.
    plt.axvline(df1[column].mean(), color='blue', linestyle='dashed', linewidth=2, label=f'{label1} mean')
    plt.axvline(df2[column].mean(), color='orange', linestyle='dashed', linewidth=2, label=f'{label2} mean')

    plt.xlabel(f'{column} Values')
    plt.ylabel('Frequency')
    plt.title(f'Distribution of {column} Values')

    # Without a legend the labels passed above are never rendered.
    plt.legend()

    plt.show()

In [None]:
def plot_distribution(df, column):
    """Show a histogram of ``df[column]`` with automatically chosen bins."""
    values = df[column]
    x_axis_text = f'{column} Values'
    heading = f'Distribution of {column} Values'

    # Draw the bars first, then decorate the current axes.
    plt.hist(values, bins='auto', edgecolor='black')
    plt.title(heading)
    plt.ylabel('Frequency')
    plt.xlabel(x_axis_text)

    plt.show()

In [None]:
def create_df(group,ses,task,contrast):
    """Assemble the covariate + n-back table for one group and session.

    Builds the subject table with ``create_subs_df``, drops the excluded
    subjects via ``rm_CUD_baseline_subs`` when ``group`` is ``'MM'`` (for
    every session, not only baseline), and merges the n-back measures in
    with ``create_merged_df``.
    """
    subjects = create_subs_df(group,ses,task,contrast)
    subjects = rm_CUD_baseline_subs(subjects) if group == 'MM' else subjects
    return create_merged_df(subjects,group,ses)

In [None]:
def rm_CUD_baseline_subs(df_subs):
    """Return ``df_subs`` without the rows of subjects that had CUD at baseline.

    The input frame is not modified; the surviving rows keep their original
    index labels.
    """
    # Subjects with CUD at baseline.
    cud_at_baseline = ['MM_014','MM_188','MM_197','MM_217','MM_228','MM_239','MM_241']

    keep_row = ~df_subs['subs'].isin(cud_at_baseline)
    return df_subs[keep_row]

In [None]:
#perform independent two-sample t-test (HC baseline vs. MM baseline)

def t_test_between_HC_and_MM_baseline(HC_baseline_df,MM_baseline_df,columns=None):
    """Run an independent two-sample t-test (equal variances) per column.

    Parameters
    ----------
    HC_baseline_df, MM_baseline_df : pandas.DataFrame
        Frames holding the same measurement columns for the two groups.
    columns : iterable of str, optional
        Columns to test. Defaults to ``HC_baseline_df.columns[6:]``, i.e.
        everything after the first six columns. That positional default only
        skips the covariates when there are exactly six of them (it shifts,
        for example, if only one sex dummy column exists), so pass the names
        explicitly when the layout may differ.

    Returns
    -------
    dict
        ``{column: {'difference of means': mean(MM) - mean(HC),
        'p value': two-sided p}}``, both rounded to three decimals.

    Raises
    ------
    ValueError
        If a tested column contains NaN (``nan_policy='raise'``).

    Notes
    -----
    NOTE(review): frames produced by ``create_df`` carry n-back columns that
    ``create_merged_df`` already mean-centered separately for each group,
    which pushes both group means toward zero. Confirm whether this test
    should run on uncentered values instead.
    """
    if columns is None:
        columns = HC_baseline_df.columns[6:]

    dict_results = {}

    for column in columns:
        hc_values = HC_baseline_df[column]
        mm_values = MM_baseline_df[column]

        # Reported direction: MM minus HC.
        mean_difference = np.mean(mm_values) - np.mean(hc_values)

        # Only non-default options are spelled out; the test stays two-sided.
        _, pval = stats.ttest_ind(hc_values, mm_values, equal_var=True, nan_policy='raise')

        dict_results[column] = {
            'difference of means': round(mean_difference, 3),
            'p value': round(pval, 3),
        }

    return dict_results


In [None]:
def calc_paired_diff(MM_baseline_df,MM_1year_df,column):
    """Return the per-subject ``1year - baseline`` differences of ``column``.

    Only subjects whose ``subs`` id occurs in both frames are used. The
    result follows the subject order of ``MM_baseline_df``.

    Parameters
    ----------
    MM_baseline_df, MM_1year_df : pandas.DataFrame
        Frames with a ``subs`` column and the requested ``column``.
    column : str
        Name of the measurement column.

    Returns
    -------
    list of float
        One difference per paired subject; empty if no subject is shared.

    Raises
    ------
    pandas.errors.MergeError
        If a subject id appears more than once in either frame, because the
        pairing would then be ambiguous.
    """
    baseline = MM_baseline_df[['subs', column]].rename(columns={column: 'baseline'})
    followup = MM_1year_df[['subs', column]].rename(columns={column: 'followup'})

    # An inner merge pairs the sessions in one pass and keeps the order of the
    # baseline frame, so the output is deterministic.
    paired = baseline.merge(followup, on='subs', how='inner', validate='one_to_one')

    xdiff = (paired['followup'] - paired['baseline']).astype(float).tolist()

    return xdiff

In [None]:
#perform paired t-test (MM baseline vs. MM 1year)

def paired_t_test_between_MM_baseline_1year(MM_baseline_df,MM_1year_df,columns=None):
    """Run a paired t-test (1year vs. baseline) per column.

    The test is a one-sample t-test of the paired differences returned by
    ``calc_paired_diff`` against a population mean of 0.

    Parameters
    ----------
    MM_baseline_df, MM_1year_df : pandas.DataFrame
        Frames with a ``subs`` column and the same measurement columns.
    columns : iterable of str, optional
        Columns to test. Defaults to ``MM_baseline_df.columns[6:]``, i.e.
        everything after the first six columns. That positional default only
        skips the covariates when there are exactly six of them, so pass the
        names explicitly when the layout may differ.

    Returns
    -------
    dict
        ``{column: {'mean difference': mean(1year - baseline),
        'p value': two-sided p}}``, both rounded to three decimals.

    Raises
    ------
    ValueError
        If the differences of a column contain NaN (``nan_policy='raise'``).

    Notes
    -----
    NOTE(review): frames produced by ``create_df`` carry n-back columns that
    ``create_merged_df`` already mean-centered separately for each session,
    which pushes both session means toward zero. Confirm whether this test
    should run on uncentered values instead.
    """
    if columns is None:
        columns = MM_baseline_df.columns[6:]

    dict_results = {}

    for column in columns:
        # 1year - baseline difference per paired subject.
        xdiff = calc_paired_diff(MM_baseline_df,MM_1year_df,column)
        mean_difference = np.mean(xdiff)

        _, pval = stats.ttest_1samp(xdiff, popmean=0, nan_policy='raise', alternative='two-sided')

        dict_results[column] = {
            'mean difference': round(mean_difference, 3),
            'p value': round(pval, 3),
        }

    return dict_results


In [None]:
def save_df(df,group,ses):
    """Write ``df`` as CSV into the behavioral n-back derivatives folder.

    The folder is created when it does not exist yet; the file name encodes
    ``group`` and ``ses``. Nothing is returned.
    """
    # Make sure the output directory is there before writing.
    output_dir = os.path.join('../../../derivatives', 'behavioral', 'task-nback')
    os.makedirs(output_dir, exist_ok=True)

    df.to_csv(f'../../../derivatives/behavioral/task-nback/nback-ACC_nback-RT_group-{group}_ses-{ses}.csv')

    return


In [None]:
# (group, session) combinations that are loaded, saved and compared below.
inputs = [('HC','baseline'),('MM','baseline'),('MM','1year')]

input_dfs = {}
output_dfs = {}

for group,ses in inputs:
    df = create_df(group,ses,'nback','twoback-zeroback')
    save_df(df,group,ses)
    input_dfs[f'{group}_{ses}_df'] = df
    #if desired a distribution for any variable of an individual group can be plotted
    #plot_distribution(df,'RT_2b_cor')

# NOTE(review): create_merged_df mean-centers the n-back columns separately for
# every group/session before they reach the tests below, which pushes each
# group's mean toward zero. Confirm whether the comparisons should be run on
# uncentered values instead.

# Independent two-sample t-test: HC baseline vs. MM baseline.
ttest_2samp = t_test_between_HC_and_MM_baseline(input_dfs['HC_baseline_df'],input_dfs['MM_baseline_df'])
output_dfs['HC_baseline_vs._MM_baseline_ttest'] = pd.DataFrame.from_dict(ttest_2samp)

#if desired the two distributions for any variable can be plotted together with the means highlighted
#plot_two_distributions(input_dfs['HC_baseline_df'],input_dfs['MM_baseline_df'],'RT_all_cor')

# Paired t-test: MM baseline vs. MM 1year.
ttest_paired = paired_t_test_between_MM_baseline_1year(input_dfs['MM_baseline_df'],input_dfs['MM_1year_df'])
output_dfs['MM_baseline_vs._MM_1year_paired_ttest'] = pd.DataFrame.from_dict(ttest_paired)

#if desired the difference distributions for any variable can be plotted together with the mean highlighted
#plot_paired_difference_distribution(input_dfs['MM_baseline_df'],input_dfs['MM_1year_df'],'RT_all_cor')

# Human-readable names for the n-back measures in the summary table.
column_titles = {'ACC_all': 'Combined accuracy', 'RT_all_cor': 'Combined reaction time',
                 'ACC_0b': 'Accuracy of zero-back trials', 'RT_0b_cor': 'Reaction time of zero-back trials',
                 'ACC_2b': 'Accuracy of two-back trials', 'RT_2b_cor': 'Reaction time of two-back trials'}

# Tag the first row of every result table with the comparison name (remaining
# rows get NaN) and rename the measure columns, building new frames instead of
# mutating the stored ones in place.
output_dfs = {
    title: result
    .assign(comparison=[title] + [np.nan] * (len(result) - 1))
    .rename(columns=column_titles)
    for title, result in output_dfs.items()
}

# Concatenate once rather than growing a frame inside a loop, then transpose
# so that every measure becomes a row.
final_df = pd.concat(list(output_dfs.values())).T

display(final_df)
final_df.to_csv('../../../derivatives/behavioral/task-nback/nback-ACC_nback-RT_comparison.csv')
