# Automated QC
## Exclude subjects without phenotypic or neuroimaging data, subjects aged 19+, and subjects with poor-quality scans
## Run this after QSIPrep and sMRIPrep, and before beginning FBA
### If replicating this study, make sure your dataset is BIDS-compliant and:
1) Update the variable "bids_dir" to reflect where your data is stored
2) Have your phenotypic data stored in the BIDS code directory with the name "HBN_query.csv"
3) Make sure you have collected at the very least: Identifiers, Basic demographics, TOWRE scores, WISC scores, Barratt, and EHQ scores from the HBN data portal

# 1) Import packages and load data
### Make sure to read the instructions in the github and the preamble to this notebook

In [None]:
import pandas as pd
import numpy as np
import os

# CHANGE THIS VARIABLE TO REFLECT WHERE YOUR HBN BIDS DATA LIVE
bids_dir = '/PATH/TO/HBN_BIDS_ROOT'
# Every path below is built by string concatenation, so make sure the root
# ends with a path separator whether or not the user typed one.
bids_dir = os.path.join(bids_dir, '')

qc_thresh = .5 # Any subject with an image QC prediction lower than this is excluded

# THESE VARIABLES SHOULD NOT CHANGE IF THE ANALYSIS WAS RUN ACCORDING TO INSTRUCTIONS
code_dir = bids_dir+'code/'
derivatives_dir = bids_dir+'derivatives/'
pod2_dir = derivatives_dir+'qsiprep/' # HBN-POD2 data
freesurfer_dir = derivatives_dir+'freesurfer/'

# LOAD THE PHENOTYPIC DATA
HBN_query = pd.read_csv(code_dir+'HBN_query.csv') # This should point to the phenotypic file downloaded from LORIS
participants_csv = pd.read_csv(pod2_dir+'participants.tsv',delimiter='\t')

# Sort the participants table itself (not just a copy of its subject column)
# and renumber its rows. Later cells look rows up by a subject's position in
# pod2_subs, so the list and the table must share the same order.
participants_csv = participants_csv.sort_values('subject_id', kind='mergesort').reset_index(drop=True)

# GET SUBJECTS WITH PHENOTYPIC DATA
# The last 11 characters of each identifier are dropped and 'sub-' is prepended
# so the names can be compared against the participants table.
query_subs = ['sub-'+name[:-11] for name in HBN_query['Identifiers']]
# Taken from the sorted table, so pod2_subs[i] is row i of participants_csv
pod2_subs = list(participants_csv['subject_id'])

# This is where output variables will be stored
out_variable_dir = os.getcwd()+'/output_variables/'
os.makedirs(out_variable_dir, exist_ok=True)

# This is where data for subjects passing QC will go
qsiprep_fba_dir = derivatives_dir+'qsiprep_fba_subs/'
os.makedirs(qsiprep_fba_dir, exist_ok=True)

In [None]:
# Output variables storing information of subjects who pass QC
subs_pass = []
query_inds_pass = []
pod2_inds_pass = []

# Set to True to print, for every excluded subject, which checks failed
show_failures = False


def _is_int_like(value):
    """Return True if int(value) succeeds (False for NaN, '.', None, ...)."""
    try:
        int(value)
    except (TypeError, ValueError, OverflowError):
        return False
    return True


# Map each subject to its FIRST row in the phenotypic table. This gives the
# same answer as query_subs.index(sub) without rescanning the list per subject.
query_ind_by_sub = {}
for row_ind, name in enumerate(query_subs):
    query_ind_by_sub.setdefault(name, row_ind)

for pod2_ind, sub in enumerate(pod2_subs):
    # CHECK IF SUB HAS PHENOTYPIC DATA
    query_ind = query_ind_by_sub.get(sub)
    if query_ind is None:
        continue

    # CHECK ELIGIBILITY CRITERIA
    # 1) QC
    qc_metric = participants_csv['xgb_qc_score'][pod2_ind] # QC Score
    # IF XGB SCORE NOT AVAILABLE, DEFAULT TO DL QC
    if not _is_int_like(qc_metric):
        qc_metric = participants_csv['dl_qc_score'][pod2_ind]
    qc_pass = qc_metric > qc_thresh

    # 2) Acquisition
    acq = participants_csv['site_variant'][pod2_ind] # Acquisition parameters
    # A missing value is read as a float NaN, which does not support `in`
    acq_pass = isinstance(acq, str) and '_64dir_Most_Common' in acq

    # 3) Age
    age = HBN_query['Basic_Demos,Age'][query_ind] # Age
    age_pass = 6 <= age < 19

    # 4) Phenotypic data
    # Reading scores: must be flagged valid and both scaled scores must be numeric
    TOWRE_pass = (HBN_query['TOWRE,TOWRE_Valid'][query_ind]=='1'
                  and _is_int_like(HBN_query['TOWRE,TOWRE_PDE_Scaled'][query_ind])
                  and _is_int_like(HBN_query['TOWRE,TOWRE_SWE_Scaled'][query_ind]))
    Barratt_pass = HBN_query['Barratt,Barratt_Total_Edu'][query_ind] != '.' # SES
    EHQ_pass = HBN_query['EHQ,EHQ_Total'][query_ind] != '.' # Handedness
    WISC_pass = HBN_query['WISC,WISC_complete'][query_ind]=='1' # IQ

    # 5) FreeSurfer Stats
    FS_pass = os.path.exists(freesurfer_dir+sub+'/stats/aseg.stats')

    # Barratt and WISC are only reported; they are not exclusion criteria here.
    if not (qc_pass and acq_pass and age_pass and TOWRE_pass and EHQ_pass and FS_pass):
        if show_failures:
            print('Subject:',sub,', QC:',qc_pass,', acq:',acq_pass,', age:',age_pass,', TOWRE:',TOWRE_pass,
                 ', Barratt:',Barratt_pass,', EHQ:',EHQ_pass,', WISC:',WISC_pass)
        continue

    # Save good subject data out
    subs_pass.append(sub)
    query_inds_pass.append(query_ind)
    pod2_inds_pass.append(pod2_ind)
    # Re-running this cell must not crash on links created by an earlier run
    link_path = qsiprep_fba_dir+sub
    if not os.path.lexists(link_path):
        os.symlink(pod2_dir+sub, link_path, target_is_directory=True)

# Write the results once, after every subject has been checked (this also
# guarantees the files exist when no subject passes).
np.save(out_variable_dir+'subs.npy',subs_pass,allow_pickle=True)
np.save(out_variable_dir+'query_inds.npy',query_inds_pass,allow_pickle=True)
np.save(out_variable_dir+'pod2_inds.npy',pod2_inds_pass,allow_pickle=True)

# Generate population template subjects
### Make a dataframe to work with temporarily

In [None]:
# Demographics of the subjects that passed QC, taken from the participants table
sex = np.asarray(participants_csv['sex'][pod2_inds_pass])
age = np.asarray(participants_csv['age'][pod2_inds_pass])
qc = []
swe = []
pde = []
swe_key = 'TOWRE,TOWRE_SWE_Scaled'
pde_key = 'TOWRE,TOWRE_PDE_Scaled'
n = len(query_inds_pass)

# Columns of the phenotypic table that hold diagnoses (hoisted out of the loop)
dx_keys = [key for key in HBN_query.keys() if 'DX' in key]

# GET CLINICIAN DIAGNOSES
tr_rd = []
# Walk the two index lists in lockstep: one row per passing subject in each table
for ind_query, ind_pod2 in zip(query_inds_pass, pod2_inds_pass):
    # Get Reading Dx info, see if there is a reading disability.
    # Non-string cells (e.g. NaN for "no diagnosis") are skipped.
    dxs = [HBN_query[key][ind_query] for key in dx_keys]
    has_reading_dx = any('Impairment in Reading' in dx for dx in dxs if isinstance(dx, str))
    # Get Reading scores
    swe_sub = int(HBN_query[swe_key][ind_query])
    pde_sub = int(HBN_query[pde_key][ind_query])
    swe.append(swe_sub)
    pde.append(pde_sub)
    # Get QC Scores
    qc_metric = participants_csv['xgb_qc_score'][ind_pod2]
    # if XGB scores not available (int() fails, e.g. on NaN), default to DL qc
    try:
        int(qc_metric)
    except (TypeError, ValueError, OverflowError):
        qc_metric = participants_csv['dl_qc_score'][ind_pod2]
    qc.append(qc_metric)

    # Label: 'RD' = reading diagnosis and both scores <= 85,
    #        'TR' = no reading diagnosis and both scores >= 90,
    #        'other' = everything in between.
    if has_reading_dx and swe_sub<=85 and pde_sub<=85:
        tr_rd.append('RD')
    elif not has_reading_dx and swe_sub>=90 and pde_sub>=90:
        tr_rd.append('TR')
    else:
        tr_rd.append('other')

# POPULATE DATAFRAME
df = pd.DataFrame()
df['TR_RD'] = tr_rd
df['subjects'] = subs_pass
df['AGE'] = age
df['SEX'] = sex
df['QC'] = qc
df['SWE'] = swe
df['PDE'] = pde

### Pick 40 subjects for population template
Split the age range into 10 bins; within each bin, select 2 males (one TR, one RD) and 2 females (one TR, one RD)

NOTE: you may have to run this code block twice for the subject names to be written out properly

In [None]:
# Split age range into 10 bins
n_bins = 10
age_min = min(age)
age_max = max(age)
age_bins = np.linspace(age_min,age_max,n_bins+1)

# Initialize output
subs_pop = [] # population template subjects


def _pick_best_qc(frame, sex_code, group, in_bin):
    """Return (subject, QC) for the highest-QC row of `frame` matching the
    given sex and TR/RD group inside the age bin, or None if nothing matches.

    `in_bin` is a boolean mask over the rows of `frame`.
    """
    matches = frame.loc[(frame['SEX'] == sex_code) & (frame['TR_RD'] == group) & in_bin,
                        ['subjects', 'QC']]
    if matches.empty:
        return None
    # Highest QC first; rows with a missing QC sort last. Stable sort keeps the
    # original row order among ties.
    best = matches.sort_values('QC', ascending=False, kind='mergesort').iloc[0]
    return best['subjects'], best['QC']


# (sex code, group, label printed on success, message printed when bin is empty)
selections = [
    ('M', 'RD', 'RD Male:', 'no male RD in age bin'),
    ('M', 'TR', 'TR Male:', 'no male TR in age bin'),
    ('F', 'RD', 'RD Female:', 'no female RD in age bin'),
    ('F', 'TR', 'TR Female:', 'no female TR in age bin'),
]

# Loop over age bins
for bin_ind in range(n_bins):
    # Extract age range for bin
    bin_min = age_bins[bin_ind]
    bin_max = age_bins[bin_ind+1]
    print('\nAge Bin: [',bin_min,'-',bin_max,']')

    # Bins are half-open [min, max) so a subject whose age sits exactly on a
    # boundary cannot be selected in two neighbouring bins; only the last bin
    # includes its upper edge so the oldest subject is not dropped.
    if bin_ind == n_bins - 1:
        in_bin = (df['AGE'] >= bin_min) & (df['AGE'] <= bin_max)
    else:
        in_bin = (df['AGE'] >= bin_min) & (df['AGE'] < bin_max)

    # Pick highest QC score in each age bin
    # Try for both male and female, and TR and RD
    for sex_code, group, label, empty_msg in selections:
        picked = _pick_best_qc(df, sex_code, group, in_bin)
        if picked is None:
            print(empty_msg)
            continue
        sub_max_qc, qc_max = picked
        subs_pop.append(sub_max_qc)
        print(label, sub_max_qc,', QC:',qc_max)

# Sort subjects and get their indexes in the phenotypic subject list for future reference
subs_pop.sort()
inds_pop = np.asarray([query_subs.index(sub) for sub in subs_pop])

# Save out subjects as array and text file
np.save(out_variable_dir+'subs_template.npy',subs_pop,allow_pickle=True)
# Save the indexes (not the subject names again) under the inds_ file name
np.save(out_variable_dir+'inds_template.npy',inds_pop,allow_pickle=True)

template_txt = code_dir+"/fba/subs_template.txt"
# Make sure the target folder exists, and let the context manager flush and
# close the file so the list is complete on disk after a single run.
os.makedirs(os.path.dirname(template_txt), exist_ok=True)
with open(template_txt, "w") as textfile:
    textfile.writelines(sub + "\n" for sub in subs_pop)

print('\n population template subjects have been written out to',code_dir+"/fba/subs_template.txt")

## YOU CAN NOW BEGIN THE FBA BASH SCRIPT CODES (scripts 1-5)