# Make Data Frame and Save Out Cohort Files for ModelArray
## This happens before running the GAMs
## Before running this notebook, make sure the FBA derivatives directory contains only the subjects you want to include in the FBA

In [1]:
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pingouin
import seaborn as sns
import scipy
import os
import os.path
import statistics

# CHANGE THIS VARIABLE TO REFLECT WHERE YOUR HBN BIDS DATA LIVE
bids_dir = '/om4/group/gablab/data/hbn_bids/' # Path should end with a '/'

# THESE VARIABLES SHOULD NOT CHANGE IF THE ANALYSIS WAS RUN ACCORDING TO INSTRUCTIONS
code_dir = bids_dir+'code/'
derivatives_dir = bids_dir+'derivatives/'
pod2_dir = derivatives_dir+'qsiprep/'
fba_dir = derivatives_dir+'fba/'
freesurfer_dir = derivatives_dir+'freesurfer/'
out_variable_dir = os.getcwd()+'/output_variables/'

# LOAD THE PHENOTYPIC DATA
HBN_query = pd.read_csv(code_dir+'HBN_query.csv') # This should point to the phenotypic file downloaded from LORIS
participants_tsv = pd.read_csv(pod2_dir+'participants.tsv',delimiter='\t')
# Sort the table itself (stable sort) and renumber its rows, so that a position
# in the sorted `pod2_subs` list below is also the row label of that subject in
# `participants_tsv`. Sorting only the list would leave the two out of step
# whenever participants.tsv is not already ordered by subject_id.
participants_tsv = participants_tsv.sort_values('subject_id',kind='mergesort').reset_index(drop=True)

# GET SUBJECTS WITH PHENOTYPIC DATA
# The last 11 characters of each identifier are dropped and 'sub-' is prepended.
# NOTE(review): presumably this strips a fixed-length suffix from the LORIS identifier - confirm
query_subs = ['sub-'+name[:-11] for name in HBN_query['Identifiers']]
pod2_subs = list(participants_tsv['subject_id']) # Already sorted, because the table was sorted above

# GET SUBJECTS INCLUDED IN FBA
# Every entry of fba_dir whose name starts with 'sub' counts as an included subject
subs_final= np.asarray([s.split('/')[-1] for s in glob.glob(fba_dir+'sub*')])
subs_final.sort()

# Fail early, naming every subject that is missing from either phenotypic table
pod2_known = set(pod2_subs)
query_known = set(query_subs)
missing_subs = [sub for sub in subs_final if sub not in pod2_known or sub not in query_known]
if missing_subs:
    raise ValueError('Subjects in the FBA directory without phenotypic data: '+', '.join(missing_subs))

# GET INDEXES OF SUBJECTS IN PHENOTYPIC FILES
# pod2_inds_final holds row labels of participants_tsv; query_inds_final holds row labels of HBN_query
pod2_inds_final = np.asarray([pod2_subs.index(sub) for sub in subs_final])
query_inds_final = np.asarray([query_subs.index(sub) for sub in subs_final])

# TRACTS TO ANALYZE
tracts = [] # If you want to add average tract stats to the data frame, add tract names below using TractSeg naming conventions
#tracts = ['AF_left','AF_right','SLF_I_left','SLF_I_right']

  return warn(
  HBN_query = pd.read_csv(code_dir+'HBN_query.csv') # This should point to the phenotypic file downloaded from LORIS


### Extract phenotypes for subjects passing QC

In [14]:
# Pull the phenotypic variables for the FBA subjects, in the order of `subs_final`.
subs = subs_final
n = np.size(subs) # number of subjects
site = participants_tsv['scan_site_id'][pod2_inds_final]
# A sex code of 0 maps to 'M'; every other value (including a missing one) maps to 'F'
# NOTE(review): a missing sex code is therefore labelled 'F' - confirm this is intended
sex = ['M' if sex_sub==0.0 else 'F' for sex_sub in np.asarray(HBN_query['Basic_Demos,Sex'][query_inds_final])]
age = np.asarray(HBN_query['Basic_Demos,Age'][query_inds_final])
ehi = np.asarray(HBN_query['EHQ,EHQ_Total'][query_inds_final]).astype(float)
# Convert EHI scores to handedness: below -40 is 'L', -40 to 40 is 'A', above 40 is 'R'
hand = []
for h in ehi:
    if h < -40:
        hand.append('L')
    elif h <= 40:
        hand.append('A')
    elif h > 40:
        hand.append('R')
    else:
        # A NaN score fails every comparison above; keep a placeholder so that
        # `hand` stays the same length as, and aligned with, the subject list
        hand.append(np.nan)

# Reading Measures (Standardized)
towre = np.asarray(HBN_query['TOWRE,TOWRE_Total_Scaled'][query_inds_final]).astype(float)
swe = np.asarray(HBN_query['TOWRE,TOWRE_SWE_Scaled'][query_inds_final]).astype(float)
pde = np.asarray(HBN_query['TOWRE,TOWRE_PDE_Scaled'][query_inds_final]).astype(float)
# Raw Reading Measures
swe_r = np.asarray(HBN_query['TOWRE,TOWRE_SWE_Raw'][query_inds_final]).astype(float)
pde_r = np.asarray(HBN_query['TOWRE,TOWRE_PDE_Raw'][query_inds_final]).astype(float)
towre_r = swe_r + pde_r

# Make RD and TR groups
# The diagnosis columns are the same for every subject, so find them once
dx_keys = [key for key in HBN_query.keys() if 'DX' in key]
groups=[]
for index in range(n):
    ind_query = query_inds_final[index]
    # Get diagnostic information
    dxs = [HBN_query[key][ind_query] for key in dx_keys]
    # Check if participant was diagnosed with RD (non-string entries are skipped)
    has_rd_dx = any('Impairment in Reading' in dx for dx in dxs if isinstance(dx, str))
    # Implement score and diagnosis classification
    # NOTE(review): the TR rule uses >= 90 for SWE but > 90 for PDE - confirm the asymmetry is intended
    if has_rd_dx and swe[index]<=85 and pde[index]<=85:
        groups.append('RD')
    elif not has_rd_dx and swe[index]>=90 and pde[index]>90:
        groups.append('TR')
    else:
        # Includes subjects whose scores are NaN, since every comparison with NaN is False
        groups.append('OTHER')

# These contain NaNs and have to be dealt with differently
# (values that cannot be parsed as numbers are coerced to NaN)
ses = np.asarray(pd.to_numeric(HBN_query['Barratt,Barratt_Total_Edu'][query_inds_final],errors='coerce'))
wisc_vsi = np.asarray(pd.to_numeric(HBN_query['WISC,WISC_VSI'][query_inds_final],errors='coerce')) # Visual Spatial Index
wisc_vci = np.asarray(pd.to_numeric(HBN_query['WISC,WISC_VCI'][query_inds_final],errors='coerce')) # Verbal Comprehension Index

### Extract intracranial volumes, mean FD, and globally averaged fixel metrics

In [12]:
def _first_line_as_float(path):
    """Return the first line of the text file at `path`, stripped and parsed as a float."""
    with open(path) as handle:
        return float(handle.readlines()[0].strip())


# One entry per subject, in the order of `subs_final`
icvs = []
gfds = []
gfcs = []
gfdcs = []
motions = []
neighbor_corrs = []

global_stats_dir = fba_dir+'template/fixel_stats/'

for sub in subs_final:

    # ICV comes from the subject's FreeSurfer aseg.stats file
    with open(freesurfer_dir+sub+'/stats/aseg.stats') as handle:
        stats_lines = handle.readlines()
    # The value is the second-to-last comma-separated field of the 35th line
    # NOTE(review): assumes the volume of interest always sits on that line - confirm for your FreeSurfer version
    icvs.append(float(stats_lines[34].split(',')[-2]))

    # Quality measures (motion and neighbor correlation) come from the first matching ImageQC file
    qc_matches = glob.glob(pod2_dir+'/'+sub+'/ses-*/dwi/'+sub+'_ses-*ImageQC_dwi.csv')
    qc_table = pd.read_csv(qc_matches[0])
    motions.append(qc_table['mean_fd'][0])
    neighbor_corrs.append(qc_table['raw_neighbor_corr'][0])

    # Globally averaged fixel stats: each file holds the value on its first line
    gfds.append(_first_line_as_float(global_stats_dir+'gfd/'+sub+'_gfd.txt'))
    gfcs.append(_first_line_as_float(global_stats_dir+'glog_fc/'+sub+'_glog_fc.txt'))
    gfdcs.append(_first_line_as_float(global_stats_dir+'gfdc/'+sub+'_gfdc.txt'))


### Create DataFrame

In [15]:
def _read_tract_average(metric, sub, tract):
    """Return the first line of a subject's tract-average file for `metric`, parsed as a float."""
    stat_path = fba_dir+'template/tractstats/'+metric+'/'+sub+'/'+tract+'.txt'
    # Use a context manager so the file handle is closed after reading
    with open(stat_path) as f:
        return float(f.readlines()[0].strip())


# Make dataframe: one row per subject, one column per variable
df = pd.DataFrame()
df['subject_id'] = subs
df['GROUP'] = groups
df['AGE'] = age
df['SEX'] = sex
df['EHI'] = ehi
df['HAND'] = hand
df['TOWRE'] = towre
df['SWE'] = swe
df['PDE'] = pde
df['SWE_RAW'] = swe_r
df['PDE_RAW'] = pde_r
df['TOWRE_RAW'] = towre_r
df['SES'] = ses
df['WISC_VSI']=wisc_vsi
df['WISC_VCI']=wisc_vci
df['ICV'] = icvs
df['logICV'] = np.log(icvs)
df['gFD'] = gfds
df['gFC'] = gfcs
df['gFDC'] = gfdcs
df['MOTION'] = motions
df['N_CORR'] = neighbor_corrs
# Drop the pandas index of `site` so values are assigned by position, not by label
df['SITE'] = np.asarray(site)
# Add tract averages if of interest
for tract in tracts:
    # Each metric gets its own column; FDC goes to '<tract>_fdc' so it no
    # longer overwrites the FC column
    for metric in ('fd','fc','fdc'):
        df[tract+'_'+metric] = [_read_tract_average(metric, sub, tract) for sub in subs]
# Save out dataframe (create the output folder first so the write cannot fail on a missing directory)
os.makedirs(out_variable_dir, exist_ok=True)
df.to_pickle(out_variable_dir+'df.pkl')

In [16]:
# Display the assembled data frame as the cell output
df

Unnamed: 0,subject_id,GROUP,AGE,SEX,EHI,HAND,TOWRE,SWE,PDE,SWE_RAW,...,WISC_VSI,WISC_VCI,ICV,logICV,gFD,gFC,gFDC,MOTION,N_CORR,SITE
0,sub-NDARAA947ZG5,TR,13.627880,M,80.00,R,102.0,106.0,98.0,85.0,...,84.0,92.0,1.683538e+06,14.336408,0.249560,-0.002229,0.254570,0.812289,0.617743,CBIC
1,sub-NDARAA948VFH,TR,7.982660,F,90.00,R,103.0,109.0,96.0,63.0,...,94.0,111.0,1.357012e+06,14.120796,0.305954,-0.022630,0.300181,0.367211,0.818152,RU
2,sub-NDARAB708LM5,OTHER,9.208532,M,33.35,A,87.0,103.0,73.0,71.0,...,117.0,92.0,1.833313e+06,14.421635,0.264511,0.124419,0.306069,0.769198,0.753416,CBIC
3,sub-NDARAC331VEH,TR,14.167351,M,67.80,R,100.0,98.0,102.0,82.0,...,129.0,121.0,1.563031e+06,14.262138,0.282996,0.030885,0.292144,0.154693,0.761416,CBIC
4,sub-NDARAC349YUC,TR,10.051791,F,86.67,R,103.0,109.0,97.0,77.0,...,89.0,98.0,1.651895e+06,14.317434,0.267326,0.067075,0.284775,0.386436,0.686540,CBIC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
978,sub-NDARZW619BDL,TR,10.120123,M,100.00,R,112.0,100.0,123.0,71.0,...,108.0,106.0,1.421000e+06,14.166871,0.292376,-0.012170,0.288109,0.427320,0.738410,CBIC
979,sub-NDARZW873DN3,OTHER,13.436344,F,73.34,R,96.0,108.0,85.0,86.0,...,81.0,98.0,1.597672e+06,14.284058,0.307543,0.129453,0.358295,0.283051,0.709917,CBIC
980,sub-NDARZX163EWC,TR,9.898471,F,46.67,R,124.0,129.0,116.0,89.0,...,94.0,100.0,1.507703e+06,14.226098,0.290473,0.028136,0.297724,0.172784,0.758520,CBIC
981,sub-NDARZY101JNB,TR,7.517111,M,40.00,A,97.0,102.0,93.0,47.0,...,111.0,116.0,1.519813e+06,14.234098,0.292752,0.086046,0.318926,0.209331,0.833693,CBIC


## Make cohort files for ModelArray analyses

In [17]:
# Create the ModelArray input and output folders when they do not exist yet
for folder_path in (fba_dir+'template/modelarray_inputs', fba_dir+'template/modelarray_outputs'):
    if not os.path.isdir(folder_path):
        os.makedirs(folder_path)

# Rows kept in the group-comparison cohorts: everyone whose GROUP is not 'OTHER'
in_group_analysis = df['GROUP']!='OTHER'

# Write two cohort CSVs per metric: all subjects, and the non-'OTHER' subset.
# Each CSV is the full data frame plus the metric name and a per-subject source file path.
cohort_metrics = ['fd','fdc','log_fc','fa_DKI','md_DKI','kfa_DKI','mk_DKI','ICVF_NODDI','OD_NODDI']
for metric in cohort_metrics:
    cohort = df.copy()
    cohort['scalar_name'] = [metric]*len(df)
    source_files = []
    for sub in df['subject_id']:
        source_files.append('fixel_stats/'+metric+'_smooth/'+sub+'_'+metric+'.mif')
    cohort['source_file'] = source_files
    cohort.to_csv(fba_dir+'template/modelarray_inputs/cohort_'+metric+'.csv',index=False)
    group_cohort = cohort[in_group_analysis]
    group_cohort.to_csv(fba_dir+'template/modelarray_inputs/cohort_'+metric+'_group.csv',index=False)