### Data format

In [121]:
import pandas as pd
import numpy as np
import os

In [133]:
# Root folder of the dataset, relative to the notebook's working directory.
base_path = "./DataScience2019_MRI/"

# Sub-folder and file name of the behavioural summary table under the root.
behaviour_path, behaviour_file = "Behavioral/cleaned", "HBNFinalSummaries.csv"

In [134]:
# Assemble the CSV location from the configured pieces, then load the table.
# low_memory=False is the pandas read_csv option that parses the file in one
# pass rather than in chunks.
behaviour_csv = os.path.join(base_path, behaviour_path, behaviour_file)
behaviour_data = pd.read_csv(behaviour_csv, low_memory=False)

In [135]:
# Report the table dimensions as a (rows, columns) tuple.
n_rows, n_columns = behaviour_data.shape
print((n_rows, n_columns))

(2096, 399)


The dataset contains 2096 subjects and 399 feature columns. Around 150 of these columns describe the diagnoses given (15 columns for each of the 10 possible diagnoses).

We now filter out subjects whose evaluation is incomplete, i.e. whose `NoDX` field is neither `Yes` nor `No`.

In [136]:
# Keep only rows whose 'NoDX' field holds one of the two accepted answers,
# then renumber the rows 0..n-1 so labels and positions coincide.
evaluation_complete = behaviour_data['NoDX'].isin(['Yes', 'No'])
behaviour_data = behaviour_data.loc[evaluation_complete].reset_index(drop=True)

In [137]:
# Frequency of each category recorded in the first diagnosis slot.
primary_category = behaviour_data['DX_01_Cat']
primary_category.value_counts()

Neurodevelopmental Disorders                                  1167
No Diagnosis Given                                             235
Anxiety Disorders                                              189
Depressive Disorders                                            94
Disruptive                                                      40
Trauma and Stressor Related Disorders                           35
Obsessive Compulsive and Related Disorders                      16
Elimination Disorders                                           11
Other Conditions That May Be a Focus of Clinical Attention       8
Bipolar and Related Disorders                                    6
Disruptive, Impulse Control and Conduct Disorders                5
Neurocognitive Disorders                                         3
Schizophrenia Spectrum and other Psychotic Disorders             3
Gender Dysphoria                                                 1
Substance Related and Addictive Disorders                     

We can also discard disorder categories that occur fewer than $threshold$ times. The count takes into account all ten possible diagnoses of each subject, not only the first one (DX_01_Cat).

In [138]:
# Minimum number of occurrences (over all diagnosis slots) for a category
# to be kept as a target class.
threshold = 10

# Names of the ten category columns: DX_01_Cat ... DX_10_Cat.
category_columns = ['DX_' + str(i).zfill(2) + '_Cat' for i in range(1, 11)]

# Pool the categories of every diagnosis slot of every subject into one flat
# array and discard the empty slots (NaN).
# Fixes: the original indexed an undefined name (`disorders`) and relied on
# the `np.str` alias, which no longer exists in current NumPy releases.
disorders_all = behaviour_data[category_columns].values.flatten()
disorders_all = disorders_all[pd.notnull(disorders_all)]

unique, counts = np.unique(disorders_all, return_counts=True)

# Keep (and report) the categories that reach the threshold.
most_common_disorders = []
for un, c in zip(unique, counts):
    if c >= threshold:
        most_common_disorders.append(un)
        print('{0: <60} found {1} times'.format(un, c))

Anxiety Disorders                                            found 596 times
Bipolar and Related Disorders                                found 10 times
Depressive Disorders                                         found 179 times
Disruptive                                                   found 240 times
Disruptive, Impulse Control and Conduct Disorders            found 35 times
Elimination Disorders                                        found 96 times
Feeding and Eating Disorders                                 found 10 times
Neurodevelopmental Disorders                                 found 2081 times
No Diagnosis Given                                           found 235 times
Obsessive Compulsive and Related Disorders                   found 74 times
Other Conditions That May Be a Focus of Clinical Attention   found 23 times
Schizophrenia Spectrum and other Psychotic Disorders         found 10 times
Trauma and Stressor Related Disorders                        found 77 times


For each subject we now create one indicator feature per retained diagnosis category, disregarding all other information about the diagnoses.

In [139]:
# One row per retained category, one column per subject; an entry is 1.0 when
# the subject has that category in any of the diagnosis slots, else 0.0.
classes = np.zeros((len(most_common_disorders), behaviour_data.shape[0]))
df_disorders = behaviour_data[category_columns]

for i, disorder in enumerate(most_common_disorders):
    # Exact comparison (NaN compares as False). The previous substring test
    # (`disorder in x`) also flagged longer category names that merely contain
    # this one, e.g. 'Disruptive' matched
    # 'Disruptive, Impulse Control and Conduct Disorders', which disagreed with
    # the exact-match counts used to select the categories.
    mask = df_disorders.eq(disorder)

    # Fill the row positionally so the result does not depend on the frame
    # carrying a 0..n-1 index.
    classes[i] = mask.any(axis=1).to_numpy(dtype=float)

Now we can safely remove the original diagnosis columns.

In [140]:
# Column labels as a NumPy string array (the removed `np.str` alias is
# replaced by the builtin `str`).
behaviour_data_columns = behaviour_data.columns.values.astype(str)

# Select every column whose label contains 'DX' anywhere. This covers the
# DX_* columns and also 'NoDX'. `np.char.find` is the public spelling of
# `np.core.defchararray.find` and returns -1 where the substring is absent.
columns_to_drop = behaviour_data_columns[
    np.char.find(behaviour_data_columns, 'DX') != -1]

behaviour_data = behaviour_data.drop(columns=columns_to_drop)

In [141]:
# Append one indicator column per retained category, named after the category.
for disorder_name, indicator_row in zip(most_common_disorders, classes):
    behaviour_data[disorder_name] = indicator_row

In [142]:
# Dimensions of the table after the column changes, shown as (rows, columns).
n_rows, n_columns = behaviour_data.shape
(n_rows, n_columns)

(1814, 261)

In [31]:
# Count missing values per column, then keep the entries whose label contains 'DX'.
# NOTE(review): this cell's execution count (In [31]) predates the cells above,
# and its recorded output lists the original DX_* columns. It appears to have
# been run before those columns were dropped; confirm the intended position.
nans = behaviour_data.isnull().sum().filter(like='DX', axis=0)

In [34]:
# Labels of the selected columns as a NumPy array.
nans.index.to_numpy()

array(['NoDX', 'DX_01_Cat', 'DX_01_Sub', 'DX_01', 'DX_01_Spec',
       'DX_01_Code', 'DX_01_Time', 'DX_01_Confirmed', 'DX_01_Presum',
       'DX_01_RC', 'DX_01_RuleOut', 'DX_01_ByHx', 'DX_01_New',
       'DX_01_Rem', 'DX_01_PRem', 'DX_01_Past_Doc', 'DX_02_Cat',
       'DX_02_Sub', 'DX_02', 'DX_02_Spec', 'DX_02_Code', 'DX_02_Time',
       'DX_02_Confirmed', 'DX_02_Presum', 'DX_02_RC', 'DX_02_RuleOut',
       'DX_02_ByHx', 'DX_02_New', 'DX_02_Rem', 'DX_02_PRem',
       'DX_02_Past_Doc', 'DX_03_Cat', 'DX_03_Sub', 'DX_03', 'DX_03_Spec',
       'DX_03_Code', 'DX_03_Time', 'DX_03_Confirmed', 'DX_03_Presum',
       'DX_03_RC', 'DX_03_RuleOut', 'DX_03_ByHx', 'DX_03_New',
       'DX_03_Rem', 'DX_03_PRem', 'DX_03_Past_Doc', 'DX_04_Cat',
       'DX_04_Sub', 'DX_04', 'DX_04_Spec', 'DX_04_Code', 'DX_04_Time',
       'DX_04_Confirmed', 'DX_04_Presum', 'DX_04_RC', 'DX_04_RuleOut',
       'DX_04_ByHx', 'DX_04_New', 'DX_04_Rem', 'DX_04_PRem',
       'DX_04_Past_Doc', 'DX_05_Cat', 'DX_05_Sub', 'DX_05', 