In [1]:
import itertools

%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsOneClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix 
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_fscore_support

In [2]:
# Fix random seed 
np.random.seed(13)

# Function Definitions 

In [3]:
metrics = ['accuracy', 'balanced_accuracy', 'precision', 'recall', 'f1_score']

def compute_scores(y_true, y_pred): 
    d = {}
    
    for metric in metrics: 
        if metric == 'accuracy': 
            d[metric] = accuracy_score(y_true, y_pred)
        elif metric == 'balanced_accuracy': 
            d[metric] = balanced_accuracy_score(y_true, y_pred)
        elif metric == 'precision': 
            d[metric] = precision_score(y_true, y_pred)
        elif metric == 'recall': 
            d[metric] = recall_score(y_true, y_pred)
        elif metric == 'f1_score': 
            d[metric] = f1_score(y_true, y_pred)
        else: 
            raise ValueError('You provided a non-supported evaluation metric.')
            
    return d

In [4]:
def mean_imputer(x, y):
    return np.where(np.isnan(x), np.ma.array(x, mask=np.isnan(x)).mean(axis=0), x),\
            np.where(np.isnan(y), np.ma.array(x, mask=np.isnan(x)).mean(axis=0), y) 

In [23]:
def binary_classification(features, labels, test_splits, disorders, scalers, estimator, 
                          param_grid, scale_features, scenario, train_mode, verbose=True):
    
    # Initialize results 
    results = {}
    predictions = {}

    for disorder in disorders: 
        if disorder not in ['Attention-Deficit/Hyperactivity Disorder']:
#         if disorder == 'No Diagnosis Given': 
            continue

        if scenario == 'Disorder vs No Diagnosis':
            # Preserve subjects that belong to either class
            mask = (labels[disorder] == 1) | (labels['No Diagnosis Given'] == 1)
            
            X = features[mask.values]
            Y = labels[mask.values]
            
        elif scenario == 'Disorder vs Rest':
            # Preserve all subjects
            X = features
            Y = labels
        else: 
            raise ValueError('Incorrect scenario value!')

        # List with all subject IDs in the dataset intersection 
        intersection_IDs = []
        for split in test_splits: 
            intersection_IDs = intersection_IDs + split
        
        if train_mode == 'Intersection': 
            datapoints = intersection_IDs
            # Binary labels for current disorder 
            y_true = Y[disorder].loc[datapoints]
        elif train_mode == 'Everything': 
            datapoints = X.index.values.tolist()
            # Binary labels for current disorder 
            y_true = Y[disorder].loc[datapoints]
        else: 
            raise ValueError('Incorrect training mode provided!')
        
        # Initialize predictions (make predictions only on the intersection)
        y_pred = pd.Series(data=np.empty(len(intersection_IDs)),index=intersection_IDs, dtype=int)
        
        for test_split in test_splits:
            # Train split
            train_split = list(set(datapoints) - set(test_split))
            
            if bool(set(train_split) & set(test_split)):
                raise ValueError('Training and Testing splits overlap!')
            
            X_train, X_test = X.loc[train_split].values, X.loc[test_split].values
            y_train, y_test = y_true.loc[train_split].values.astype(int), y_true.loc[test_split].values.astype(int)

            # Impute missing values
            X_train, X_test = mean_imputer(X_train, X_test)
            
            # Pre-processing 
            if scale_features:
                scaler = scalers[disorder]
                Z_train = scaler.fit_transform(X_train)
            else:
                Z_train = X_train

            N = len(y_train)
            N_0 = np.sum(y_train == 0)
            N_1 = np.sum(y_train == 1)
            
            param_grid['class_weight'].append({0: N/N_0, 1: (N_0/N_1)*N/N_1})
            
            # Hyper-parameter tuning 
            _classifier = estimator

            search = GridSearchCV(
                        estimator=_classifier,
                        param_grid=param_grid,
                        scoring='f1',
                        n_jobs=-1,
                        iid=False,
                        cv=StratifiedKFold(n_splits=5))

            search.fit(Z_train, y_train)

            best_params = search.best_params_

            # Train on the "best" configuration 
            classifier = estimator
            classifier.set_params(**best_params)
            classifier.fit(Z_train, y_train)

            # Testing 
            if scale_features: 
                Z_test = scaler.transform(X_test)
            else: 
                Z_test = X_test 
                
            y_pred.loc[test_split] = classifier.predict(Z_test)
    
        # Results 
        results[disorder] = compute_scores(y_true.loc[intersection_IDs].values.astype(int), y_pred.values.astype(int))
        predictions[disorder] = y_pred.values.astype(int)

        if verbose: 
            print('================================= {0} ================================='.format(disorder))

            print('accuracy {:.3f} balanced_accuracy {:.3f} precision {:.3f} recall {:.3f} f1_score {:.3f}' \
                      .format(results[disorder]['accuracy'], results[disorder]['balanced_accuracy'], 
                              results[disorder]['precision'], results[disorder]['recall'], results[disorder]['f1_score']))
            
    return results, predictions

# Behavioural Data 

In [6]:
behaviour_data = pd.read_csv('data/Behavioral/cleaned/HBNFinalSummaries.csv', low_memory=False)

# Drop patients with incomplete diagnosis
initial_size = behaviour_data.shape[0]
behaviour_data = behaviour_data[behaviour_data['NoDX'].isin(['Yes', 'No'])]
new_size = behaviour_data.shape[0]
print('Removing', initial_size - new_size, 'patients as their evaluation was incomplete.')

most_common_disorders = ['Attention-Deficit/Hyperactivity Disorder', 'Anxiety Disorders', 'Specific Learning Disorder',
                         'Autism Spectrum Disorder', 'Disruptive', 'No Diagnosis Given', 'Communication Disorder',
                         'Depressive Disorders']

category_columns = ['DX_' + str(i).zfill(2) + '_Cat' for i in range(1, 11)] +\
                   ['DX_' + str(i).zfill(2) + '_Sub' for i in range(1, 11)]

# find users that have no diagnosis within these top diseases
# filtering should cahnge anything as this should also happen at a later stage
mask = None
for col in category_columns:
    mask_col = behaviour_data[col].isin(most_common_disorders)
    if mask is None:
        mask = mask_col
    else:
        mask = mask | mask_col

initial_size = behaviour_data.shape[0]
behaviour_data = behaviour_data[mask]
behaviour_data = behaviour_data.reset_index(drop=True)
new_size = behaviour_data.shape[0]
behaviour_data.rename(columns={'EID': 'ID'}, inplace=True) 

print('Removing', initial_size - new_size, 'patients as their diagnoses were very uncommon.')

behaviour_data

Removing 282 patients as their evaluation was incomplete.
Removing 73 patients as their diagnoses were very uncommon.


Unnamed: 0,Anonymized.ID,ID,Sex,Age,Study.Site,NoDX,DX_01_Cat,DX_01_Sub,DX_01,DX_01_Spec,...,WISC_PSI,WISC_FSIQ,YSR_AB,YSR_AD,YSR_WD,YSR_RBB,YSR_SC,YSR_Ext,YSR_Int,YSR_Total
0,A00078864,NDARYM832PX3,1,7.048254,1,Yes,Anxiety Disorders,,Generalized Anxiety Disorder,,...,,,,,,,,,,
1,A00078865,NDARNJ687DMC,1,6.348163,1,Yes,Depressive Disorders,,Disruptive Mood Dysregulation Disorder,,...,,,,,,,,,,
2,A00078866,NDARRM363BXZ,0,10.052589,1,Yes,Neurodevelopmental Disorders,Attention-Deficit/Hyperactivity Disorder,ADHD-Combined Type,,...,,,,,,,,,,
3,A00078867,NDARUW586LLL,1,12.319415,1,Yes,Depressive Disorders,,Major Depressive Disorder,Single episode,...,,,9.0,12.0,8.0,7.0,9.0,16.0,29.0,85.0
4,A00078868,NDARDC298NW4,0,13.901437,1,Yes,Neurodevelopmental Disorders,Intellectual Disability,Intellectual Disability-Mild,,...,,,8.0,10.0,5.0,2.0,11.0,10.0,26.0,70.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1736,A00093552,NDARJX939UCQ,1,10.203057,1,Yes,Neurodevelopmental Disorders,Attention-Deficit/Hyperactivity Disorder,ADHD-Combined Type,,...,103.0,77.0,,,,,,,,
1737,A00093553,NDARJJ817UP1,0,9.126397,3,Yes,Neurodevelopmental Disorders,Specific Learning Disorder,Specific Learning Disorder with Impairment in ...,,...,100.0,103.0,,,,,,,,
1738,A00093557,NDARYZ986HEW,1,7.563084,3,Yes,Neurodevelopmental Disorders,Attention-Deficit/Hyperactivity Disorder,ADHD-Combined Type,,...,116.0,129.0,,,,,,,,
1739,A00093558,NDARPM572ZZV,0,9.832762,3,Yes,Neurodevelopmental Disorders,Specific Learning Disorder,Specific Learning Disorder with Impairment in ...,rule out,...,98.0,112.0,,,,,,,,


In [7]:
classes = np.zeros((len(most_common_disorders), behaviour_data.shape[0]), dtype=np.int32)

df_disorders = behaviour_data[category_columns]

for i, disorder in enumerate(most_common_disorders):
    mask = df_disorders.select_dtypes(include=[object]). \
            applymap(lambda x: disorder in x if pd.notnull(x) else False)
    
    disorder_df = df_disorders[mask.any(axis=1)]
    
    np.add.at(classes[i], disorder_df.index.values, 1)

classes = np.transpose(classes)
classes = np.column_stack((behaviour_data['ID'], classes))

labels = pd.DataFrame(data=classes, columns=['ID'] + most_common_disorders)
labels

Unnamed: 0,ID,Attention-Deficit/Hyperactivity Disorder,Anxiety Disorders,Specific Learning Disorder,Autism Spectrum Disorder,Disruptive,No Diagnosis Given,Communication Disorder,Depressive Disorders
0,NDARYM832PX3,0,1,1,0,0,0,0,0
1,NDARNJ687DMC,0,0,0,0,0,0,0,1
2,NDARRM363BXZ,1,0,0,0,0,0,0,0
3,NDARUW586LLL,0,0,0,0,0,0,0,1
4,NDARDC298NW4,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
1736,NDARJX939UCQ,1,0,0,0,1,0,1,0
1737,NDARJJ817UP1,0,0,1,0,0,0,0,0
1738,NDARYZ986HEW,1,1,0,0,1,0,0,0
1739,NDARPM572ZZV,0,0,1,0,0,0,0,0


# FA Per Tract 

In [8]:
fa_per_tract = pd.read_csv('data/MRI/MRI/DTI/FAPerTract.csv', low_memory=False)

# Remove "/" from the end some IDs 
fa_per_tract['ID'] = fa_per_tract['ID'].apply(lambda x: x[:-1] if "/" in x else x)

# Keep patients who have both behavioural records and MRI
fa_patient_IDs = np.array(list(set(behaviour_data['ID'].values) & set(fa_per_tract['ID'].values)))
fa_per_tract = fa_per_tract[fa_per_tract['ID'].isin(fa_patient_IDs)].sort_values(by='ID')
labels_fa_tr = labels[labels['ID'].isin(fa_patient_IDs)].sort_values(by='ID')

print('Number of patients: ', len(fa_patient_IDs))

# Convert site to integer ID
scan_sites = fa_per_tract['ScanSite'].unique()
map_site_to_ID = {site: i for i, site in enumerate(scan_sites)}
fa_per_tract['ScanSite'] = fa_per_tract['ScanSite'].apply(lambda x: map_site_to_ID[x])

# Add Sex and Age to features 
fa_per_tract = pd.merge(
                fa_per_tract, 
                behaviour_data[behaviour_data['ID'].isin(fa_patient_IDs)].sort_values(by='ID')[['ID','Sex', 'Age']],
                on='ID',
                how='inner')

fa_per_tract.set_index('ID', inplace=True)
labels_fa_tr.set_index('ID', inplace=True)

Number of patients:  801


In [9]:
fa_per_tract

Unnamed: 0_level_0,LeftThalamicRadiation,RightThalamicRadiation,LeftCorticospinal,RightCorticospinal,LeftCingulumCingulate,RightCingulumCingulate,LeftCingulumHippocampus,RightCingulumHippocampus,CallosumForcepsMajor,CallosumForcepsMinor,...,LeftSLF,RightSLF,LeftUncinate,RightUncinate,LeftArcuate,RightArcuate,GlobalFA,ScanSite,Sex,Age
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NDARAA536PTU,0.455656,0.453115,0.620558,0.618361,0.517805,0.444551,,0.297005,,0.574576,...,0.502264,0.518771,0.368745,0.423124,0.561095,0.470981,0.454698,0,0,11.998402
NDARAA948VFH,0.440407,0.420813,0.553176,0.566157,0.442870,0.462149,,,0.534137,0.512610,...,0.457156,0.436721,0.394274,0.419798,0.491984,0.424049,0.439574,1,1,7.982660
NDARAC349YUC,0.360179,0.400220,0.480111,0.499736,0.408975,0.406019,0.352879,0.291959,0.503494,0.524352,...,0.430824,0.481923,0.367575,0.405958,0.484720,0.485776,0.404561,2,1,10.051791
NDARAC853DTE,0.412105,0.428844,0.578922,0.594946,,,,,,0.489935,...,0.352073,0.380953,0.387661,0.449428,0.376423,0.435387,0.406589,1,0,10.227469
NDARAC857HDB,0.287333,,0.367654,0.330376,,,,,,0.432363,...,0.421391,0.344268,0.368084,0.514472,0.421558,,0.356337,2,0,6.975017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NDARZW472CCF,0.445741,0.496935,0.520404,0.497049,,,,,0.606643,0.566425,...,0.349277,0.411196,0.400458,0.405271,0.452404,0.447451,0.428610,1,0,9.572781
NDARZW873DN3,,,,,0.319956,,,,,,...,0.413686,0.438971,,0.325042,0.479818,0.426632,0.365432,2,1,13.436344
NDARZW930MF2,0.399457,0.383633,0.526873,0.498594,0.340887,,,0.409829,0.570211,0.548486,...,0.398634,0.422165,0.397124,0.444051,0.501421,0.461800,0.417440,1,0,10.156399
NDARZY101JNB,,,,,0.344458,,,,,,...,0.298890,0.347665,,,,0.363475,0.299453,2,0,7.517111


In [10]:
labels_fa_tr

Unnamed: 0_level_0,Attention-Deficit/Hyperactivity Disorder,Anxiety Disorders,Specific Learning Disorder,Autism Spectrum Disorder,Disruptive,No Diagnosis Given,Communication Disorder,Depressive Disorders
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
NDARAA536PTU,1,0,0,0,0,0,0,0
NDARAA948VFH,1,0,0,0,0,0,0,0
NDARAC349YUC,1,0,0,0,0,0,0,0
NDARAC853DTE,1,1,0,0,0,0,0,0
NDARAC857HDB,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...
NDARZW472CCF,1,0,0,0,0,0,0,0
NDARZW873DN3,0,1,0,0,0,0,1,0
NDARZW930MF2,0,1,0,0,0,0,0,0
NDARZY101JNB,0,0,0,1,0,0,0,0


# Structural MRI 

In [11]:
cort_thick_l = pd.read_csv('data/MRI/MRI/structuralMRI/CorticalThicknessLHROI.csv', low_memory=False)
cort_thick_r = pd.read_csv('data/MRI/MRI/structuralMRI/CorticalThicknessRHROI.csv', low_memory=False)
cort_vol_l = pd.read_csv('data/MRI/MRI/structuralMRI/CorticalVolumeLHROI.csv', low_memory=False)
cort_vol_r = pd.read_csv('data/MRI/MRI/structuralMRI/CorticalVolumeRHROI.csv', low_memory=False)
sub_cort_vol_l = pd.read_csv('data/MRI/MRI/structuralMRI/SubCorticalVolumeLHROI.csv', low_memory=False)
sub_cort_vol_r = pd.read_csv('data/MRI/MRI/structuralMRI/SubCorticalVolumeRHROI.csv', low_memory=False)
glob_thick = pd.read_csv('data/MRI/MRI/structuralMRI/GlobalCorticalThickness.csv', low_memory=False)

# Drop duplicated columns 
cort_thick_r = cort_thick_r.drop(columns=['eTIV', 'ScanSite'])
cort_vol_l = cort_vol_l.drop(columns=['eTIV', 'ScanSite'])
cort_vol_r = cort_vol_r.drop(columns=['eTIV', 'ScanSite'])
sub_cort_vol_l = sub_cort_vol_l.drop(columns=['eTIV', 'ScanSite'])
sub_cort_vol_r = sub_cort_vol_r.drop(columns=['eTIV', 'ScanSite'])
glob_thick = glob_thick.drop(columns=['ScanSite'])

# Join tables 
struct_mri = pd.merge(cort_thick_l, cort_thick_r, on='ID', how='inner')
struct_mri = pd.merge(struct_mri, cort_vol_l, on='ID', how='inner')
struct_mri = pd.merge(struct_mri, cort_vol_r, on='ID', how='inner')
struct_mri = pd.merge(struct_mri, sub_cort_vol_l, on='ID', how='inner')
struct_mri = pd.merge(struct_mri, sub_cort_vol_r, on='ID', how='inner')
struct_mri = pd.merge(struct_mri, glob_thick, on='ID', how='inner')

# Remove "/" from the end some IDs 
struct_mri['ID'] = struct_mri['ID'].apply(lambda x: x[:-1] if "/" in x else x)

# Keep patients who have both behavioural records and MRI
struct_patient_IDs = np.array(list(set(behaviour_data['ID'].values) & set(struct_mri['ID'].values)))
struct_mri = struct_mri[struct_mri['ID'].isin(struct_patient_IDs)].sort_values(by='ID')
labels_str = labels[labels['ID'].isin(struct_patient_IDs)].sort_values(by='ID')

print('Number of patients: ', len(struct_patient_IDs))

# Add Sex and Age to features 
struct_mri = pd.merge(
                struct_mri, 
                behaviour_data[behaviour_data['ID'].isin(struct_patient_IDs)].sort_values(by='ID')[['ID','Sex', 'Age']],
                on='ID',
                how='inner')

# Check that IDs from the features and labels tables match
if any(struct_mri['ID'].values != labels_str['ID'].values):
    raise ValueError('There is a mismatch in IDs!')
    
struct_mri.set_index('ID', inplace=True)
labels_str.set_index('ID', inplace=True)

# Convert site to integer ID
scan_sites = struct_mri['ScanSite'].unique()
map_site_to_ID = {site: i for i, site in enumerate(scan_sites)}
struct_mri['ScanSite'] = struct_mri['ScanSite'].apply(lambda x: map_site_to_ID[x])

Number of patients:  1027


In [12]:
struct_mri

Unnamed: 0_level_0,lh_G.S_frontomargin_thickness,lh_G.S_occipital_inf_thickness,lh_G.S_paracentral_thickness,lh_G.S_subcentral_thickness,lh_G.S_transv_frontopol_thickness,lh_G.S_cingul.Ant_thickness,lh_G.S_cingul.Mid.Ant_thickness,lh_G.S_cingul.Mid.Post_thickness,lh_G_cingul.Post.dorsal_thickness,lh_G_cingul.Post.ventral_thickness,...,rh_superiortemporal_volume,rh_supramarginal_volume,rh_frontalpole_volume,rh_temporalpole_volume,rh_transversetemporal_volume,rh_insula_volume,BrainSegVolNotVent_y,GlobalCorticalThickness,Sex,Age
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NDARAA075AMK,2.683,2.715,2.422,2.856,3.240,2.978,2.887,2.779,3.188,2.774,...,14267,12552,1788,2667,1038,6893,1163044,2.803030,1,6.728040
NDARAA536PTU,2.432,2.871,2.536,2.790,2.782,2.971,2.832,2.994,2.932,2.729,...,16053,11576,1465,3062,1406,9631,1203675,2.712315,0,11.998402
NDARAA948VFH,2.750,2.679,2.703,3.432,3.246,3.391,3.161,3.089,3.480,3.149,...,13756,11335,1461,2682,1095,6799,1107657,2.928645,1,7.982660
NDARAC349YUC,2.392,2.941,2.570,2.612,2.556,3.279,3.056,3.232,3.388,3.365,...,12823,12948,945,2630,1126,8154,1154134,2.745770,1,10.051791
NDARAC350XUM,2.724,2.256,2.385,2.450,3.108,2.956,3.109,2.789,2.832,3.067,...,12067,12013,1891,2202,953,8739,1275317,2.498120,0,8.211955
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NDARZW873DN3,2.710,2.551,3.130,3.149,2.836,3.188,3.201,3.137,3.276,3.069,...,13675,11022,1386,2710,965,8743,1276325,2.820720,1,13.436344
NDARZW930MF2,2.632,2.500,2.210,2.810,2.603,3.109,3.043,2.965,3.284,3.073,...,12707,10961,1323,3052,1216,7754,1164539,2.706000,0,10.156399
NDARZY101JNB,2.864,2.867,2.924,3.173,3.009,3.208,3.229,2.997,3.521,3.252,...,15479,15710,1405,2802,1217,8770,1289003,2.996145,0,7.517111
NDARZY668NMV,2.849,2.659,2.638,2.952,3.304,3.135,3.225,2.897,3.032,3.111,...,11910,12836,1430,2626,934,6611,1008736,2.717490,1,11.623431


In [13]:
labels_str

Unnamed: 0_level_0,Attention-Deficit/Hyperactivity Disorder,Anxiety Disorders,Specific Learning Disorder,Autism Spectrum Disorder,Disruptive,No Diagnosis Given,Communication Disorder,Depressive Disorders
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
NDARAA075AMK,0,0,0,0,0,1,0,0
NDARAA536PTU,1,0,0,0,0,0,0,0
NDARAA948VFH,1,0,0,0,0,0,0,0
NDARAC349YUC,1,0,0,0,0,0,0,0
NDARAC350XUM,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...
NDARZW873DN3,0,1,0,0,0,0,1,0
NDARZW930MF2,0,1,0,0,0,0,0,0
NDARZY101JNB,0,0,0,1,0,0,0,0
NDARZY668NMV,1,0,0,0,1,0,0,0


# Combined MRI 

In [14]:
combined_mri_patient_IDs = list(set(fa_per_tract.index.values.tolist()) & set(struct_mri.index.values.tolist()))
print('Number of patients having both MRI records: ', len(combined_mri_patient_IDs))

Number of patients having both MRI records:  747


In [15]:
struct_mri.reset_index(level=0, inplace=True)
labels_str.reset_index(level=0, inplace=True)
fa_per_tract.reset_index(level=0, inplace=True)
labels_fa_tr.reset_index(level=0, inplace=True)

# Consider only intersection 

# Structural MRI 
struct_mri_reduced = struct_mri[struct_mri['ID'].isin(combined_mri_patient_IDs)].sort_values(by='ID')
labels_str_reduced = labels_str[labels_str['ID'].isin(combined_mri_patient_IDs)].sort_values(by='ID')

# FA Per Tract 
fa_per_tract_reduced = fa_per_tract[fa_per_tract['ID'].isin(combined_mri_patient_IDs)].sort_values(by='ID')
labels_fa_tr_reduced = labels_fa_tr[labels_fa_tr['ID'].isin(combined_mri_patient_IDs)].sort_values(by='ID')

# Join Tables 
combined_mri = pd.merge(fa_per_tract_reduced, struct_mri_reduced, on='ID', how='inner')
combined_mri_labels = labels_str_reduced

combined_mri.set_index('ID', inplace=True)
combined_mri_labels.set_index('ID', inplace=True)

In [16]:
# Reset index for datasets 
fa_per_tract.set_index('ID', inplace=True)
labels_fa_tr.set_index('ID', inplace=True)

struct_mri.set_index('ID', inplace=True)
labels_str.set_index('ID', inplace=True)

# Disorder vs Rest

In [18]:
import pickle

# load it again
with open('data/cross_validation_splits.pkl', 'rb') as fid:
    cross_validation_splits = pickle.load(fid)
    

    N_rep, N_folds = cross_validation_splits.shape

In [19]:
# If true, each model will be trained/evaluated using only the subjects that have records for all datasets 
train_test_on_intersection = False 

if train_test_on_intersection: 
    train_mode = 'Intersection'
else: 
    train_mode = 'Everything'

## FA Per Track 

In [25]:
fa_per_tract_res = {}

### Random Forest 

In [None]:
rf_param_grid = {'n_estimators': [100], 
                 'max_depth': np.linspace(start=4,stop=20,num=5),
                 'max_features': np.linspace(start=0.2,stop=1,num=5),
                 'n_jobs': [-1], 
                 'class_weight': ['balanced']}

scalers = {disorder: RobustScaler() for disorder in most_common_disorders}

results = []

for i in range(N_rep): 

    rf_tract_res_vs_rest, rf_tract_y_pred_vs_rest = binary_classification(fa_per_tract, 
                                                                          labels_fa_tr, 
                                                                          cross_validation_splits[i],
                                                                          most_common_disorders, 
                                                                          scalers,
                                                                          RandomForestClassifier(), 
                                                                          rf_param_grid,
                                                                          scale_features=False,
                                                                          scenario='Disorder vs Rest',
                                                                          train_mode=train_mode)
    
    results.append(rf_tract_res_vs_rest)
    
fa_per_tract_res['rf'] = results

### SVM

In [45]:
svm_param_grid = {'C': [0.1, 1, 10], 
                  'kernel': ['rbf'],
                  'gamma': ['scale', 'auto'],
                  'class_weight': ['balanced']}

scalers = {}

for disorder in most_common_disorders: 
    if disorder in ['Anxiety Disorders', 'Specific Learning Disorder', 'Communication Disorder']: 
        scalers[disorder] = RobustScaler()
    else: 
        scalers[disorder] = MinMaxScaler()
    
results = []

for i in range(N_rep): 

    svm_tract_res_vs_rest, svm_tract_y_pred_vs_rest = binary_classification(fa_per_tract, 
                                                                            labels_fa_tr, 
                                                                            cross_validation_splits[i],
                                                                            most_common_disorders, 
                                                                            scalers,
                                                                            SVC(), 
                                                                            svm_param_grid,
                                                                            scale_features=True,
                                                                            scenario='Disorder vs Rest',
                                                                            train_mode=train_mode)
    
    results.append(svm_tract_res_vs_rest)
    
fa_per_tract_res['svm'] = results

accuracy 0.540 balanced_accuracy 0.566 precision 0.676 recall 0.407 f1_score 0.508
accuracy 0.342 balanced_accuracy 0.510 precision 0.272 recall 0.870 f1_score 0.414
accuracy 0.312 balanced_accuracy 0.502 precision 0.159 recall 0.781 f1_score 0.265
accuracy 0.396 balanced_accuracy 0.568 precision 0.165 recall 0.807 f1_score 0.274
accuracy 0.376 balanced_accuracy 0.572 precision 0.185 recall 0.859 f1_score 0.304
accuracy 0.364 balanced_accuracy 0.535 precision 0.121 recall 0.756 f1_score 0.209
accuracy 0.562 balanced_accuracy 0.611 precision 0.161 recall 0.674 f1_score 0.259
accuracy 0.537 balanced_accuracy 0.563 precision 0.669 recall 0.411 f1_score 0.509
accuracy 0.361 balanced_accuracy 0.535 precision 0.283 recall 0.907 f1_score 0.432
accuracy 0.309 balanced_accuracy 0.469 precision 0.148 recall 0.703 f1_score 0.244
accuracy 0.376 balanced_accuracy 0.498 precision 0.140 recall 0.667 f1_score 0.232
accuracy 0.379 balanced_accuracy 0.542 precision 0.174 recall 0.781 f1_score 0.285
accu

accuracy 0.374 balanced_accuracy 0.541 precision 0.123 recall 0.756 f1_score 0.212
accuracy 0.562 balanced_accuracy 0.535 precision 0.130 recall 0.500 f1_score 0.206
accuracy 0.525 balanced_accuracy 0.550 precision 0.653 recall 0.398 f1_score 0.495
accuracy 0.342 balanced_accuracy 0.515 precision 0.274 recall 0.889 f1_score 0.419
accuracy 0.327 balanced_accuracy 0.505 precision 0.160 recall 0.766 f1_score 0.265
accuracy 0.371 balanced_accuracy 0.524 precision 0.149 recall 0.737 f1_score 0.249
accuracy 0.344 balanced_accuracy 0.522 precision 0.166 recall 0.781 f1_score 0.274
accuracy 0.366 balanced_accuracy 0.537 precision 0.122 recall 0.756 f1_score 0.210
accuracy 0.562 balanced_accuracy 0.582 precision 0.150 recall 0.609 f1_score 0.240
accuracy 0.535 balanced_accuracy 0.560 precision 0.664 recall 0.411 f1_score 0.508
accuracy 0.347 balanced_accuracy 0.516 precision 0.275 recall 0.880 f1_score 0.419
accuracy 0.312 balanced_accuracy 0.490 precision 0.155 recall 0.750 f1_score 0.257
accu

In [None]:
# Save results 
if train_test_on_intesection: 
    with open('results_tuned/fa_per_tract_res_intersection.pickle', 'wb') as f:
        pickle.dump(fa_per_tract_res, f)
else: 
    with open('results_tuned/fa_per_tract_res_everything.pickle', 'wb') as f:
        pickle.dump(fa_per_tract_res, f)

## Structural MRI 

In [None]:
struct_mri_res = {}

### Random Forest

In [None]:
rf_param_grid = {'n_estimators': [100], 
                 'max_depth': np.linspace(start=4,stop=20,num=5),
                 'max_features': np.linspace(start=0.2,stop=1,num=5),
                 'n_jobs': [-1], 
                 'class_weight': ['balanced']}

scalers = {disorder: RobustScaler() for disorder in most_common_disorders}

results = []

for i in range(N_rep): 

    rf_struct_res_vs_rest, rf_struct_y_pred_vs_rest = binary_classification(struct_mri, 
                                                                            labels_str, 
                                                                            cross_validation_splits[i],
                                                                            most_common_disorders, 
                                                                            scalers,
                                                                            RandomForestClassifier(), 
                                                                            rf_param_grid,
                                                                            scale_features=False,
                                                                            scenario='Disorder vs Rest',
                                                                            train_mode=train_mode)
    
    results.append(rf_struct_res_vs_rest)

struct_mri_res['rf'] = results

### SVM 

In [None]:
svm_param_grid = {'C': [0.1, 1, 10], 
                  'kernel': ['rbf'],
                  'gamma': ['scale', 'auto'],
                  'class_weight': ['balanced']}

scalers = {}

for disorder in most_common_disorders: 
    if disorder in ['Autism Spectrum Disorder']: 
        scalers[disorder] = MinMaxScaler()
    else: 
        scalers[disorder] = RobustScaler()
        
results = []

for i in range(N_rep): 

    svm_struct_res_vs_rest, svm_struct_y_pred_vs_rest = binary_classification(struct_mri, 
                                                                              labels_str, 
                                                                              cross_validation_splits[i],
                                                                              most_common_disorders, 
                                                                              scalers,
                                                                              SVC(), 
                                                                              svm_param_grid,
                                                                              scale_features=True,
                                                                              scenario='Disorder vs Rest',
                                                                              train_mode=train_mode)
    
    results.append(svm_struct_res_vs_rest)

struct_mri_res['svm'] = results

In [None]:
# Save results 
if train_test_on_intesection: 
    with open('results_tuned/struct_mri_res_intersection.pickle', 'wb') as f:
        pickle.dump(struct_mri_res, f)
else: 
    with open('results_tuned/struct_mri_res_everything.pickle', 'wb') as f:
        pickle.dump(struct_mri_res, f)

## Combined MRI 

In [None]:
combined_mri_res = {}

In [None]:
rf_param_grid = {'n_estimators': [100], 
                 'max_depth': np.linspace(start=4,stop=20,num=5),
                 'max_features': np.linspace(start=0.2,stop=1,num=5),
                 'n_jobs': [-1], 
                 'class_weight': ['balanced']}

scalers = {disorder: RobustScaler() for disorder in most_common_disorders}

results = []

for i in range(N_rep): 

    rf_combined_res_vs_rest, rf_combined_y_pred_vs_rest = binary_classification(combined_mri, 
                                                                                combined_mri_labels, 
                                                                                cross_validation_splits[i],
                                                                                most_common_disorders, 
                                                                                scalers,
                                                                                RandomForestClassifier(), 
                                                                                rf_param_grid,
                                                                                scale_features=False,
                                                                                scenario='Disorder vs Rest',
                                                                                train_mode=train_mode)
    
    results.append(rf_combined_res_vs_rest)
    
combined_mri_res['rf'] = results

In [None]:
svm_param_grid = {'C': [0.1, 1, 10], 
                  'kernel': ['rbf'],
                  'gamma': ['scale', 'auto'],
                  'class_weight': ['balanced']}

scalers = {}

for disorder in most_common_disorders: 
    if disorder in ['Anxiety Disorders', 'Specific Learning Disorder', 'Communication Disorder']: 
        scalers[disorder] = RobustScaler()
    else: 
        scalers[disorder] = MinMaxScaler()
        
results = []

for i in range(N_rep): 

    svm_combined_res_vs_rest, svm_combined_y_pred_vs_rest = binary_classification(combined_mri, 
                                                                                  combined_mri_labels, 
                                                                                  cross_validation_splits[i],
                                                                                  most_common_disorders, 
                                                                                  scalers,
                                                                                  SVC(), 
                                                                                  svm_param_grid,
                                                                                  scale_features=True,
                                                                                  scenario='Disorder vs Rest',
                                                                                  train_mode=train_mode)
    
    results.append(svm_combined_res_vs_rest)

combined_mri_res['svm'] = results

In [None]:
if train_test_on_intersection:     
    with open('results_tuned/combined_mri_res_intersection.pickle', 'wb') as f:
        pickle.dump(combined_mri_res, f)
else: 
    with open('results_tuned/combined_mri_res_everything.pickle', 'wb') as f:
        pickle.dump(combined_mri_res, f)