In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
LOCATION = '../input/harvard-atlas-schizophrenia/'

In [3]:
participants = pd.read_csv(LOCATION + 'participants.csv')
control = participants[participants['diagnosis'] == 'Control']['id'].to_list()
schizophrenic = participants[participants['diagnosis'] != 'Control']['id'].to_list()

len(control), len(schizophrenic)

(206, 130)

In [4]:
tr = int(len(control) * 0.8)
va = tr + int(len(control) * 0.1)

train_list = control[:tr]
valid_list = control[tr:va]
test_list = control[va:]

tr = int(len(schizophrenic) * 0.8)
va = tr + int(len(schizophrenic) * 0.1)

train_list = train_list + schizophrenic[:tr]
valid_list = valid_list + schizophrenic[tr:va]
test_list = test_list + schizophrenic[va:]

random.shuffle(train_list)
random.shuffle(valid_list)
random.shuffle(test_list)

len(train_list), len(valid_list), len(test_list)

(268, 33, 35)

In [5]:
train_x, train_y = np.empty((0, 9216)), np.empty(0, int)
test_x, test_y = np.empty((0, 9216)), np.empty(0, int)
valid_x, valid_y = np.empty((0, 9216)), np.empty(0, int)

In [6]:
for train in train_list:
    diagnosis = participants[participants['id'] == train].iloc[0].diagnosis
    path = LOCATION + 'connectome/' + diagnosis + '/sub-'+train+'.npz'
        
    x = np.load(path)['arr_0'].reshape(-1)
    y = 0 if diagnosis == 'Control' else 1

    train_x = np.append(train_x, [x], axis=0)
    train_y = np.append(train_y, [y], axis=0)
    
train_x.shape, train_y.shape

((268, 9216), (268,))

In [7]:
for valid in valid_list:
    diagnosis = participants[participants['id'] == valid].iloc[0].diagnosis
    path = LOCATION + 'connectome/' + diagnosis + '/sub-'+valid+'.npz'
        
    x = np.load(path)['arr_0'].reshape(-1)
    y = 0 if diagnosis == 'Control' else 1

    valid_x = np.append(valid_x, [x], axis=0)
    valid_y = np.append(valid_y, [y], axis=0)
    
valid_x.shape, valid_y.shape

((33, 9216), (33,))

In [8]:
for test in test_list:
    diagnosis = participants[participants['id'] == test].iloc[0].diagnosis
    path = LOCATION + 'connectome/' + diagnosis + '/sub-'+test+'.npz'
        
    x = np.load(path)['arr_0'].reshape(-1)
    y = 0 if diagnosis == 'Control' else 1

    test_x = np.append(test_x, [x], axis=0)
    test_y = np.append(test_y, [y], axis=0)
    
test_x.shape, test_y.shape

((35, 9216), (35,))

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [10]:
params_grid_rf = {'n_estimators': [200, 500],
                  'max_features': ['auto', 'sqrt', 'log2'],
                  'max_depth' : [4, 5, 6, 7, 8],
                  'criterion' :['gini', 'entropy']}

In [11]:
random_forest_classifier = RandomForestClassifier()
grid_search_rf = GridSearchCV(estimator=random_forest_classifier, param_grid=params_grid_rf, cv=5)

In [12]:
grid_search_rf.fit(train_x, train_y)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [200, 500]})

In [13]:
grid_search_rf.best_params_

{'criterion': 'entropy',
 'max_depth': 6,
 'max_features': 'auto',
 'n_estimators': 500}

In [14]:
clf_rf = RandomForestClassifier(n_estimators=500,
                                max_features='auto',
                                max_depth=6,
                                criterion='entropy')
clf_rf = clf_rf.fit(train_x, train_y)

In [15]:
pred_y = clf_rf.predict(train_x)

print("Accuracy:", metrics.accuracy_score(train_y, pred_y))
print("Confusion:\n", metrics.confusion_matrix(train_y, pred_y))
print(metrics.classification_report(train_y, pred_y))

Accuracy: 1.0
Confusion:
 [[164   0]
 [  0 104]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       164
           1       1.00      1.00      1.00       104

    accuracy                           1.00       268
   macro avg       1.00      1.00      1.00       268
weighted avg       1.00      1.00      1.00       268



In [16]:
pred_y = clf_rf.predict(valid_x)

print("Accuracy:", metrics.accuracy_score(valid_y, pred_y))
print("Confusion:\n", metrics.confusion_matrix(valid_y, pred_y))
print(metrics.classification_report(valid_y, pred_y))

Accuracy: 0.6666666666666666
Confusion:
 [[14  6]
 [ 5  8]]
              precision    recall  f1-score   support

           0       0.74      0.70      0.72        20
           1       0.57      0.62      0.59        13

    accuracy                           0.67        33
   macro avg       0.65      0.66      0.66        33
weighted avg       0.67      0.67      0.67        33



In [17]:
pred_y = clf_rf.predict(test_x)

print("Accuracy:", metrics.accuracy_score(test_y, pred_y))
print("Confusion:\n", metrics.confusion_matrix(test_y, pred_y))
print(metrics.classification_report(test_y, pred_y))

Accuracy: 0.6857142857142857
Confusion:
 [[19  3]
 [ 8  5]]
              precision    recall  f1-score   support

           0       0.70      0.86      0.78        22
           1       0.62      0.38      0.48        13

    accuracy                           0.69        35
   macro avg       0.66      0.62      0.63        35
weighted avg       0.67      0.69      0.66        35

