In [None]:
import pandas as pd
import numpy as np
import scipy.stats
from scipy.spatial import distance
import matplotlib.pyplot as plt
#import seaborn as sns

import os.path as op
import os
import glob
import shutil
import datetime
import math

from nilearn import plotting
from nilearn import image
from nilearn import masking
import nilearn

from datetime import datetime

In [None]:
# Capture the run time once and derive the short date tag (month abbreviation,
# day, underscore, four-digit year) that other cells splice into output names.
datestring = datetime.now()
timestampStr = format(datestring, "%b%d_%Y")
print(datestring, timestampStr, sep="\n")

In [None]:
# Collect the ROI mask files (disgust, fear and emo_reg sets) from the mask folder.
proj_dir = 'PATH_TO_PROJECT'
path_to_masks = op.join(proj_dir, 'TIER/analysis_data/ROI_MASKS/')

# For a quick test run, swap in a single narrow pattern such as 'disgust*IFG.nii.gz'.
mask_patterns = ['disgust*', 'fear*', 'emo_reg*']

# glob returns matches in arbitrary (filesystem-dependent) order, so each group
# is sorted to keep the ROI order -- and therefore the output row order --
# identical from run to run and machine to machine.
maskfiles = []
for mask_pattern in mask_patterns:
    maskfiles += sorted(glob.glob(op.join(path_to_masks, mask_pattern)))
print(maskfiles)


In [None]:
# Input locations inside the TIER project tree
tier_dir = op.join(proj_dir, 'TIER')
copes_dir, varcopes_dir, zstats_dir = (
    op.join(tier_dir, 'original_data', stat_kind)
    for stat_kind in ('copes', 'varcopes', 'zstats')
)
first_level_dir = op.join(proj_dir, 'Analysis', 'first_level_standard')
path_to_goodvoxel_masks = op.join(proj_dir, 'TIER/analysis_data/ROI_masks_goodvoxels/')

# Output locations; the top-voxel mask folder is date-stamped and created up front
outputs_dir = op.join(tier_dir, 'analysis_data/analyzed_ROI_data/multivariate_ROI')
top_voxel_mask_outputdir = op.join(outputs_dir, 'subject_top_voxel_masks_' + timestampStr)
os.makedirs(top_voxel_mask_outputdir, exist_ok=True)

# The three result CSVs share one date-stamped stem and differ only in suffix:
# row-by-row output, output after vector-format conversion, and the averages across folds.
csv_stem = op.join(outputs_dir, 'multivariate_correlations_and_distances_' + timestampStr)
fname_rowbyrow = csv_stem + '_preConvert_byRow.csv'
fname_converted = csv_stem + '_convertedVectors.csv'
fname_foldsAve = csv_stem + '_aveAcrossFolds.csv'


# Contrast names have the form 'con_<index>_<label>', numbered from 1 in this order.
contrast_labels = ['tgn-gt-cgn', 'cgd-gt-cgn', 'cgf-gt-cgn', 'cgf-gt-cgd', 'cgd-gt-cgf',
                   'cgn', 'cgd', 'cgf', 'tgn', 'tgn-gt-cgd', 'tgn-gt-cgf']
cons = ['con_{}_{}'.format(idx, label) for idx, label in enumerate(contrast_labels, start=1)]
# (single-contrast variant for quick tests: cons = ['con_1_tgn-gt-cgn'])

# Only one task exists for now, so it is fixed here rather than looped over.
# Adding tasks means wrapping the extraction in another for-loop over `tasks`.
tasks = ['read']
task = tasks[0]

# ROI mask paths, plus the bare mask names (file name up to the first '.') for plotting later
ROIs = maskfiles
masklabels = [mask_path.split('/')[-1].split('.')[0] for mask_path in maskfiles]

# Voxel-selection methods to run. Only the top-100 option is active;
# the extraction loop also has branches for 'whole' and 'balanced'.
num_top_voxels_options = [100]

# Maximum number of folds (including the fold without any exclusions).
# Should equal the number of runs + 1.
num_folds = 5

# Subject information and exclusion flags
master_data = '/nese/mit/group/saxelab/projects/EMOfd/data/subject_lists/EMOfd_subject_info_211026.csv'
df = pd.read_csv(master_data)

# Written to the 'source' column of the output CSV
sub_proj = 'MIT'

In [None]:
# Column layout of one result row (one contrast1 x contrast2 comparison).
df_mag_columns = [
    'acquisitionID', 'participantID', 'source', 'task', 'experiment',
    'roi', 'contrast_for_selection', 'contrast1', 'contrast2', 'fold', 'excluded_run', 'voxels_in_roi_mask',
    'good_voxels_in_roi', 'pearson_r', 'p_score', 'distance',
    'method', 'mean_top_voxels_combined_fold', 'mean_top_voxels_left_out', 'missing_data_flag',
    'vector1_curfold_topextracted_con1', 'vector2_leftout_topextracted_con2',
]
df_mag = pd.DataFrame(columns=df_mag_columns)

# Two more empty frames, with one column per ROI mask label
df_pearson, df_pscore = (pd.DataFrame(columns=masklabels) for _ in range(2))

# Date-stamped text file that collects warnings raised while processing
warningfile = 'warnings_{}.txt'.format(timestampStr)
print(warningfile)
#writeissue(warningfile, message)
def writeissue(filename, message):
    """Append a warning message to a text log.

    Parameters
    ----------
    filename : str
        Path of the log file; it is created if it does not exist yet.
    message : str
        Text to record. Three newlines are written after it so that
        consecutive entries are separated by blank lines.
    """
    # The context manager closes the handle even if the write raises.
    with open(filename, 'a') as f:
        f.write(message + '\n\n\n')


In [None]:
# Unique acquisition IDs of the rows whose exclude_from_analysis flag is False
rows_to_keep = df.exclude_from_analysis == False
filesubs = pd.unique(df.loc[rows_to_keep, 'acquisitionID']).tolist()

In [None]:
# Restrict this run to the four single-condition contrasts (this replaces the
# longer `cons` list assigned in the setup cell).
cons = ['con_6_cgn',
        'con_7_cgd',
        'con_8_cgf',
        'con_9_tgn']

# Contrast whose z-map is used to rank voxels when building the top-voxel masks.
contrast_for_selection = 'con_4_cgf-gt-cgd'


def _masked_cope_vector(cope_img, top_voxel_img):
    """Extract the cope values that fall inside a binary top-voxel mask.

    Parameters
    ----------
    cope_img : image accepted by ``nilearn.image.math_img``
        Cope image to sample.
    top_voxel_img : image on the same grid
        Mask image holding 1 for selected voxels and 0 elsewhere.

    Returns
    -------
    (masked_data, vector)
        ``masked_data`` is the masked volume with every zero replaced by NaN;
        ``vector`` is that volume flattened with the NaNs removed. A cope value
        that is exactly 0 (or NaN) inside the mask is dropped too, which is why
        two vectors built from the same mask can differ in length.
    """
    masked_img = image.math_img("img1 * img2", img1=cope_img, img2=top_voxel_img)
    masked_data = masked_img.get_fdata()
    masked_data[masked_data == 0] = np.nan
    flat = np.ravel(masked_data)
    return masked_data, flat[~np.isnan(flat)]


for num_top_voxels in num_top_voxels_options:
    print('NUM TOP VOXEL TYPE: ', num_top_voxels)
    if num_top_voxels not in ('whole', 'balanced', 100):
        # Fail early instead of hitting an undefined top_voxel_fname further down.
        raise ValueError('Unsupported num_top_voxels option: {!r}'.format(num_top_voxels))

    # 'whole' does no contrast-based selection; the other methods rank by contrast_for_selection.
    sel_cond = 'NaN' if num_top_voxels == 'whole' else contrast_for_selection

    for sub in filesubs:
        print('SUBJECT: ', sub)
        participantID = df.loc[df.acquisitionID == sub, 'participantID'].values[0]
        experiment = df.loc[df.acquisitionID == sub, 'experiment'].values[0]

        for roi in ROIs:
            mask_img_fname = roi
            roi = op.basename(roi).split('.')[0]
            print('WORKING ON ROI: ', roi)

            # Find the subject's good-voxel ROI mask (pre-written based on the
            # varcope of all contrasts in the exclude-none fold).
            goodvox_ROI_fname = '{}/ROI_GOODVOX_{}_task-{}_{}.nii.gz'.format(path_to_goodvoxel_masks, sub, task, roi)
            matches_goodvox_ROI = glob.glob(goodvox_ROI_fname)
            if not matches_goodvox_ROI:
                # Log and skip this ROI rather than crash on an empty glob result.
                print(goodvox_ROI_fname, ' is missing!')
                writeissue(warningfile, 'The file is missing: ' + goodvox_ROI_fname)
                continue
            mask_img = image.load_img(matches_goodvox_ROI[0])
            mask_img_data = mask_img.get_fdata()
            print(mask_img_data.shape)

            # Number of voxels in the good-voxel-only ROI
            voxel_filter = np.abs(mask_img_data) > 0.0
            good_voxels_in_roi = np.sum(voxel_filter)
            n_roi_voxels = np.count_nonzero(voxel_filter)
            # Raise numpy's summarisation threshold above the ROI size --
            # presumably so the voxel vectors written to the CSV below are
            # printed in full rather than abbreviated with '...'.
            np.set_printoptions(threshold=good_voxels_in_roi+100)

            # Load the original ROI image to determine missing_data_flag
            mask_img_ORIG = image.load_img(mask_img_fname)
            mask_img_ORIG_data = mask_img_ORIG.get_fdata()
            voxels_in_orig_roi = np.sum(np.abs(mask_img_ORIG_data) > 0.0)

            # An empty original mask counts as fully missing instead of dividing by zero.
            good_voxel_proportion = good_voxels_in_roi / voxels_in_orig_roi if voxels_in_orig_roi else 0.0
            print("GOOD VOXELS IN ROI PROPORTION:")
            print(good_voxel_proportion)
            missing_data_flag = bool(good_voxel_proportion < .80)
            if missing_data_flag:
                message = 'missing data flag set true for: {}, {}'.format(sub, roi)
                writeissue(warningfile, message)

            for fold in range(1, num_folds):
                print('WORKING ON FOLD: ', fold)

                # --- Build the top-voxel mask for this fold -------------------
                # The mask depends only on the selection contrast, so it is
                # built once per fold instead of once per contrast.
                combZ_current_fold = '{}/{}_{}_fold_{}_exclude_{}_{}_zstat.nii.gz'.format(zstats_dir, sub, task, fold, 'run*', contrast_for_selection)
                matches_combZ_current_fold = glob.glob(combZ_current_fold)

                if len(matches_combZ_current_fold) > 1:
                    print('There are duplicate fear files present.')
                    message = 'There are duplicate files present for: ' + combZ_current_fold
                    writeissue(warningfile, message)
                    continue
                if len(matches_combZ_current_fold) < 1:
                    print(combZ_current_fold, ' is missing!')
                    message = 'The file is missing: ' + combZ_current_fold
                    writeissue(warningfile, message)
                    continue

                combZ_img = image.load_img(matches_combZ_current_fold[0])
                print(combZ_img.shape)

                # Mask the selection z-map with the good-voxel ROI mask, i.e.
                # every voxel outside the ROI becomes 0.
                masked_combZ_img = image.math_img("img1 * img2", img1=combZ_img, img2=mask_img)
                masked_combZ_data = masked_combZ_img.get_fdata()

                # Rank voxels on a copy where zeros are NaN; argsort puts NaN last.
                nanned_combZ_data = masked_combZ_data.copy()
                nanned_combZ_data[nanned_combZ_data == 0] = np.nan
                combZ_roi_nans_inds = (-nanned_combZ_data).argsort(axis=None)

                # Mark every voxel that is NOT selected as NaN in masked_combZ_data.
                if num_top_voxels == 'whole':
                    # Keep as many voxels as there are in the good-voxel ROI.
                    nVox = n_roi_voxels
                    masked_combZ_data[np.unravel_index(combZ_roi_nans_inds[nVox:], masked_combZ_data.shape)] = np.nan
                    top_voxel_fname = '{}/{}_task-{}_fold-{}_{}_whole_good-ROI.nii.gz'.format(top_voxel_mask_outputdir, sub, task, fold, roi)

                elif num_top_voxels == 'balanced':
                    # Keep nVox voxels from each end of the ranking of non-zero voxels.
                    to_play = masked_combZ_data[np.unravel_index(combZ_roi_nans_inds, masked_combZ_data.shape)]
                    zero_ind = (np.where(to_play == 0)[0][0])
                    if n_roi_voxels < 100:
                        nVox = int(np.floor(n_roi_voxels/2.0))
                    else:
                        nVox = 50
                    masked_combZ_data[np.unravel_index(combZ_roi_nans_inds[nVox:(zero_ind-nVox)], masked_combZ_data.shape)] = np.nan
                    masked_combZ_data[np.unravel_index(combZ_roi_nans_inds[zero_ind:], masked_combZ_data.shape)] = np.nan
                    top_voxel_fname = '{}/{}_task-{}_fold-{}_{}_balanced_top_contrast-{}_from-zstat.nii.gz'.format(top_voxel_mask_outputdir, sub, task, fold, roi, contrast_for_selection)

                else:  # num_top_voxels == 100
                    # Rank by absolute z value and keep at most 100 voxels.
                    combZ_roi_nans_inds = (-np.absolute(nanned_combZ_data)).argsort(axis=None)
                    nVox = min(num_top_voxels, n_roi_voxels)
                    masked_combZ_data[np.unravel_index(combZ_roi_nans_inds[nVox:], masked_combZ_data.shape)] = np.nan
                    top_voxel_fname = '{}/{}_task-{}_fold-{}_{}_top-100_contrast-{}_from-zstat.nii.gz'.format(top_voxel_mask_outputdir, sub, task, fold, roi, contrast_for_selection)

                # Binary mask: 1 for the selected voxels, 0 for everything else.
                top_voxel_mask = np.where(np.isnan(masked_combZ_data), 0.0, 1.0)

                # Save the top-voxel mask to file
                top_voxel_img = image.new_img_like(combZ_img, top_voxel_mask)
                top_voxel_img.to_filename(top_voxel_fname)

                # --- Pull betas from the top voxels ---------------------------
                # The excluded run is the token that follows 'exclude_' in the
                # matched selection z-stat file name.
                # NOTE(review): the original read this from a variable `test`
                # that is never defined in the notebook; the z-stat file name
                # is assumed to be the intended source -- confirm.
                excluded_run = op.basename(matches_combZ_current_fold[0]).split('exclude_')[1].split('_')[0]

                for con in cons:
                    # Cope (betas) of this contrast estimated on the current fold
                    current_fold_cope_fname = '{}/{}_{}_fold_{}_exclude_{}_{}_cope.nii.gz'.format(copes_dir, sub, task, fold, 'run*', con)
                    matches_current_fold_cope_fname = glob.glob(current_fold_cope_fname)

                    if len(matches_current_fold_cope_fname) != 1:
                        print('Incorrect # of files present for:')
                        print(current_fold_cope_fname)
                        message = 'Incorrect # of files present for current-fold condition-cope: {},{},{},{}'.format(sub, fold, con, roi)
                        writeissue(warningfile, message)
                        # Skip: there is no unambiguous file to load.
                        continue

                    current_fold_cope_img = image.load_img(matches_current_fold_cope_fname[0])
                    top_voxel_masked_current_fold_cope_data, raveled_top_current_fold = _masked_cope_vector(current_fold_cope_img, top_voxel_img)

                    # Compare against every contrast estimated on the left-out run
                    for con2 in cons:
                        leftout_cope_fname = '{}/{}/{}/model/{}/{}_cope.nii.gz'.format(first_level_dir, sub, task, excluded_run, con2)
                        matches_leftout_cope_fname = glob.glob(leftout_cope_fname)

                        if len(matches_leftout_cope_fname) != 1:
                            print('Incorrect # of files present for:')
                            print(leftout_cope_fname)
                            message = 'Incorrect # of files present for left-out condition-cope: {},{},{},{}'.format(sub, excluded_run, con2, roi)
                            writeissue(warningfile, message)
                            # Skip: there is no unambiguous file to load.
                            continue

                        leftout_cope_img = image.load_img(matches_leftout_cope_fname[0])
                        top_voxel_masked_leftout_cope_data, raveled_top_leftout = _masked_cope_vector(leftout_cope_img, top_voxel_img)

                        # Correlation needs two equally long vectors of at least 2 values
                        if len(raveled_top_current_fold) < 2 or (len(raveled_top_current_fold) != len(raveled_top_leftout)):
                            print("ROI = {}, -- CUR-FOLD len: {}, LEFT-OUT len: {}".format(roi, len(raveled_top_current_fold), len(raveled_top_leftout)))
                            message = 'mismatch between top-voxel-mask of current-fold condition cope and leftout condition cope: {},fold {}, excluded {}, con1 {}, con2 {}, {}; cur-fold-length= {}, left-out-length= {}'.format(sub, fold, excluded_run, con, con2, roi, len(raveled_top_current_fold), len(raveled_top_leftout))
                            writeissue(warningfile, message)
                            continue
                        pearson_r = scipy.stats.pearsonr(raveled_top_current_fold, raveled_top_leftout)

                        mean_top_vox_curfold = np.nanmean(top_voxel_masked_current_fold_cope_data)
                        mean_top_vox_leftout = np.nanmean(top_voxel_masked_leftout_cope_data)

                        # Euclidean distance between the two voxel patterns
                        dist = distance.euclidean(raveled_top_current_fold, raveled_top_leftout)

                        # One-row frame; each vector is wrapped in a list so it
                        # occupies a single cell.
                        df_mag_currentrow = pd.DataFrame({'acquisitionID': sub, 'participantID': participantID,
                                                          'source': sub_proj,
                                                          'task': task, 'experiment': experiment, 'roi': roi,
                                                          'contrast_for_selection': sel_cond,
                                                          'contrast1': con,
                                                          'contrast2': con2,
                                                          'fold': fold,
                                                          'excluded_run': excluded_run,
                                                          'voxels_in_roi_mask': voxels_in_orig_roi,
                                                          'good_voxels_in_roi': good_voxels_in_roi,
                                                          'pearson_r': pearson_r[0],
                                                          'p_score': pearson_r[1],
                                                          'distance': dist,
                                                          'method': num_top_voxels,
                                                          'mean_top_voxels_combined_fold': mean_top_vox_curfold,
                                                          'mean_top_voxels_left_out': mean_top_vox_leftout,
                                                          'missing_data_flag': missing_data_flag,
                                                          'vector1_curfold_topextracted_con1': [raveled_top_current_fold],
                                                          'vector2_leftout_topextracted_con2': [raveled_top_leftout]})

                        # Append row by row so finished rows survive a crash;
                        # the header is written only when the file is created.
                        # NOTE: re-running on the same date appends to the existing file.
                        write_header = not os.path.isfile(fname_rowbyrow)
                        df_mag_currentrow.to_csv(fname_rowbyrow, mode='a', index=False, header=write_header)


            

In [None]:
def convert(string):
    """Parse the text form of a 1-D numeric vector into a list of floats.

    Accepts numpy's whitespace-separated form (``'[1. 2.5 -3.]'``, possibly
    wrapped over several lines) as well as the comma-separated form of a
    Python list (``'[1.0, 2.5, -3.0]'``), so the function works on both the
    row-by-row CSV and a CSV that was written after conversion.

    Parameters
    ----------
    string : str
        Bracketed vector text; surrounding whitespace is ignored.

    Returns
    -------
    list of float
        The parsed values; an empty list for ``'[]'``.

    Raises
    ------
    ValueError
        If a token is not a valid float (for example an elided ``'...'``).
    """
    # Drop the enclosing brackets, then treat commas like any other separator.
    inner = string.strip()[1:-1]
    return [float(token) for token in inner.replace(',', ' ').split()]

In [None]:
# Load the row-by-row results CSV that the extraction loop appended to.
dfmag_whole = pd.read_csv(fname_rowbyrow)

In [None]:
# Turn the two stored voxel-vector columns from text into lists of floats,
# then write the converted table to its own CSV.
for vector_column in ('vector1_curfold_topextracted_con1',
                      'vector2_leftout_topextracted_con2'):
    dfmag_whole[vector_column] = dfmag_whole[vector_column].apply(convert)

dfmag_whole.to_csv(fname_converted, index=False, header="column_names")

In [None]:
# Re-read the converted CSV written by the previous cell; this replaces the in-memory frame.
# NOTE(review): after the CSV round-trip the vector columns are presumably plain
# strings again -- confirm before using them as lists downstream.
dfmag_whole = pd.read_csv(fname_converted)

In [None]:
# Average each measure over all rows that share participant, ROI, contrast
# pair and method, which collapses the folds.
fold_group_keys = ['participantID', 'roi', 'contrast1', 'contrast2', 'method']
fold_mean_columns = ['mean_top_voxels_combined_fold',
                     'mean_top_voxels_left_out',
                     'pearson_r',
                     'p_score',
                     'distance']

df_new = dfmag_whole.groupby(fold_group_keys)[fold_mean_columns].mean()
df_new.columns = [measure + '_aveAcrossFolds' for measure in fold_mean_columns]

# Bring the grouping keys back as ordinary columns and order the rows
df_re = df_new.reset_index().sort_values(by=['participantID', 'roi', 'contrast1', 'contrast2'])

df_re.to_csv(fname_foldsAve, index=False, header='column_names')
