In [57]:
import numpy as np
from joblib import load
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report

In [2]:
class MyMinMax:
    def __init__(self, axis):
        self.sc = MinMaxScaler()
        self.axis = axis

    def fit(self, X):
        if self.axis==1:
            self.sc = self.sc.fit(X.transpose())
        elif self.axis==0:
            self.sc = self.sc.fit(X)
        return self.sc

    def transform(self, X):
        if self.axis==1:
            Xn = self.sc.transform(X.transpose()).transpose()
        elif self.axis==0:
            Xn = self.sc.transform(X)
        return Xn

    def fit_transform(self, X):
        if self.axis==1:
            self.sc = self.sc.fit(X.transpose())
            Xn = self.sc.transform(X.transpose()).transpose()
        elif self.axis==0:
            self.sc = self.sc.fit(X)
            Xn = self.sc.transform(X)
        return Xn

In [None]:
def mynormalize(df, allfeats=False):
    scalersdict = {}
    if allfeats:
        sc = MyMinMax(axis=1)
        XN = sc.fit_transform(df.values)
        scalersdict['allfeat'] = sc
    else:
        morph_feats = ['area', 'curv', 'thickness', 'volume']
        XN = np.array([], dtype=np.double)
        for ind, morph_feat in enumerate(morph_feats):
            morph_cols = [col for col in df.columns if morph_feat in col]
            X_morph = df.loc[:, morph_cols].values
            Xn = (X_morph-np.min(X_morph, axis=1).reshape(-1,1))/(np.max(X_morph, axis=1).reshape(-1,1)-np.min(X_morph, axis=1).reshape(-1,1))
            if ind == 0:
                XN = np.append(XN, Xn).reshape(Xn.shape[0], -1)
            else:
                XN = np.concatenate([XN, Xn], axis=1)
    return XN


## Goal of this notebook:
For each normalization method:<br>
$\;\;\;\;\;$ For each RFE classifier core:<br>
$\;\;\;\;\;$ $\;\;\;\;\;$ For each data matrix (corr; uncorr; ucorrleft; ucorrright):<br>
$\;\;\;\;\;$ $\;\;\;\;\;$ $\;\;\;\;\;$ 1. Find the classifier with highest performance <br>
$\;\;\;\;\;$ $\;\;\;\;\;$ $\;\;\;\;\;$ 2. Use this classifier to train on all the training set<br>
$\;\;\;\;\;$ $\;\;\;\;\;$ $\;\;\;\;\;$ 3. Measure the performance on the testing set<br><br>

Measure the performance in the testing set:
1. Load the testing set
2. Get the normalization object corresponding to the current normalization method
3. Normalize the testing set using the normalization object of the training set
4. Load the rfe+(RFE classifier core)
5. Get the selected features used for learning the best ML model
6. Select those features out of the normalized testing set
7. Predict the labels of the output matrix from step 6



In [75]:
# clf = load('./Final_Results/ML/clf_lg2_train.joblib')
clf_lab = load('./Final_Results/ML/ML/clf_lg2_train.joblib')

In [76]:
# print(clf['lSVM'].best_score_)
# print(clf['pagg'].best_score_)
# print(clf['lg'].best_score_)
# print(clf['XGB'].best_score_)
# print(clf['GNB'].best_score_)
# print(clf['SVC'].best_score_)
# print(clf['Rf'].best_score_)
# print(clf['nn'].best_score_)

print('00000000000000000000000000000000000000')
print(clf_lab['lSVM'].best_score_)
print(clf_lab['pagg'].best_score_)
print(clf_lab['lg'].best_score_)
print(clf_lab['XGB'].best_score_)
print(clf_lab['GNB'].best_score_)
print(clf_lab['SVC'].best_score_)
print(clf_lab['Rf'].best_score_)
print(clf_lab['nn'].best_score_)

00000000000000000000000000000000000000
0.5912429378531072
0.5410169491525424
0.5996610169491525
0.603050847457627
0.5720621468926554
0.6112146892655368
0.5760169491525424
0.5924858757062147


In [66]:
# clf = load('./Final_Results/ML/clf_lg2_train_corr.joblib')
clf_lab = load('./Final_Results/ML/ML/clf_lg2_train_corr.joblib')

In [68]:
# print(clf['lSVM'].best_score_)
# print(clf['pagg'].best_score_)
# print(clf['lg'].best_score_)
# print(clf['XGB'].best_score_)
# print(clf['GNB'].best_score_)
# print(clf['SVC'].best_score_)
# print(clf['Rf'].best_score_)
# print(clf['nn'].best_score_)
print('00000000000000000000000000000000000000')
print(clf_lab['lSVM'].best_score_)
print(clf_lab['pagg'].best_score_)
print(clf_lab['lg'].best_score_)
print(clf_lab['XGB'].best_score_)
print(clf_lab['GNB'].best_score_)
print(clf_lab['SVC'].best_score_)
print(clf_lab['Rf'].best_score_)
print(clf_lab['nn'].best_score_)

00000000000000000000000000000000000000
0.5843785310734464
0.5522881355932203
0.5878813559322034
0.5790677966101695
0.5330508474576272
0.5875706214689266
0.5709887005649718
0.6096892655367231


In [71]:
# clf = load('./Final_Results/ML/clf_lg2_train_corr_l.joblib')
clf_lab = load('./Final_Results/ML/ML/clf_lg2_train_corr_l.joblib')

In [72]:
# print(clf['lSVM'].best_score_)
# print(clf['pagg'].best_score_)
# print(clf['lg'].best_score_)
# print(clf['XGB'].best_score_)
# print(clf['GNB'].best_score_)
# print(clf['SVC'].best_score_)
# print(clf['Rf'].best_score_)
# print(clf['nn'].best_score_)
print('00000000000000000000000000000000000000')
print(clf_lab['lSVM'].best_score_)
print(clf_lab['pagg'].best_score_)
print(clf_lab['lg'].best_score_)
print(clf_lab['XGB'].best_score_)
print(clf_lab['GNB'].best_score_)
print(clf_lab['SVC'].best_score_)
print(clf_lab['Rf'].best_score_)
print(clf_lab['nn'].best_score_)

00000000000000000000000000000000000000
0.5674858757062147
0.5250564971751412
0.5673728813559322
0.5760734463276835
0.5350282485875706
0.5824858757062147
0.5759887005649718
0.5878813559322034


In [73]:
# clf = load('./Final_Results/ML/clf_lg1_train_corr_r.joblib')
clf_lab = load('./Final_Results/ML/ML/clf_lg1_train_corr_r.joblib')

In [74]:
# print(clf['lSVM'].best_score_)
# print(clf['pagg'].best_score_)
# print(clf['lg'].best_score_)
# print(clf['XGB'].best_score_)
# print(clf['GNB'].best_score_)
# print(clf['SVC'].best_score_)
# print(clf['Rf'].best_score_)
# print(clf['nn'].best_score_)
print('00000000000000000000000000000000000000')
print(clf_lab['lSVM'].best_score_)
print(clf_lab['pagg'].best_score_)
print(clf_lab['lg'].best_score_)
print(clf_lab['XGB'].best_score_)
print(clf_lab['GNB'].best_score_)
print(clf_lab['SVC'].best_score_)
print(clf_lab['Rf'].best_score_)
print(clf_lab['nn'].best_score_)

00000000000000000000000000000000000000
0.55954802259887
0.5349717514124294
0.5612146892655367
0.5947175141242937
0.5524858757062147
0.5775706214689266
0.5794350282485876
0.5833333333333334


Based on the current results, I am going to proceed with "lg1_train_corr", classifier XGB

In [77]:
selected_clc1 = load('./Final_Results/ML/ML/clf_lg2_train.joblib')['XGB']
selected_clc2 = load('./Final_Results/ML/ML/clf_lg2_train_corr.joblib')['nn']

print(selected_clc1.best_score_)
selected_clc1 = selected_clc1.best_estimator_

print(selected_clc2.best_score_)
selected_clc2 = selected_clc2.best_estimator_

0.603050847457627
0.6096892655367231


In [78]:
selected_clc1, selected_clc2

(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=0.8, gamma=1, gpu_id=-1,
               importance_type='gain', interaction_constraints='',
               learning_rate=0.5, max_delta_step=0, max_depth=6,
               min_child_weight=0.01, missing=nan, monotone_constraints='()',
               n_estimators=100, n_jobs=-1, num_parallel_tree=1, random_state=0,
               reg_alpha=0.001, reg_lambda=0, scale_pos_weight=1, subsample=1,
               tree_method='exact', validate_parameters=True, verbosity=None),
 MLPClassifier(activation='tanh', alpha=0.1, beta_1=0.5, beta_2=0.9,
               hidden_layer_sizes=(100, 50, 25), learning_rate='adaptive',
               max_iter=1000000))

In [79]:
# Load test dataset
df_test = pd.read_csv('./Final_Results/INITIAL_SPLIT/test_fullbrain.csv', index_col=0)
print(df_test.shape)
print(df_test['labels'].value_counts())

(67, 545)
0    36
1    31
Name: labels, dtype: int64


In [80]:
print('baseline score: ',36/(31+36))

baseline score:  0.5373134328358209


In [81]:
XN = mynormalize(df_test, allfeats=False)

In [83]:
# Load the corresponding rfe object
selected_rfe2 = load('./Final_Results/FS/rfetrain_corr_lg2.joblib')
selected_rfe1 = load('./Final_Results/FS/rfetrain_lg2.joblib')


Xtest1 = XN[:, np.where(selected_rfe1.support_)[0]]
Xtest2 = XN[:, np.where(selected_rfe2.support_)[0]]

In [84]:
# Load training dataset to train the current model using all training set
Xtrain1 = np.load('./Final_Results/FS/Xtrain_lg2.npy')
ytrain1 = np.load('./Final_Results/FS/ytrain.npy')

Xtrain.shape, ytrain.shape

((597, 11), (597,))

In [85]:
print(classification_report(df_test['labels'].values, selected_clc1.predict(Xtest)))
print("**************************************************************************")
print(classification_report(df_test['labels'].values, selected_clc2.predict(Xtest)))

ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21'] ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10']
expected f18, f17, f11, f14, f12, f13, f16, f15, f20, f19, f21 in input data

In [61]:
selected_clc = selected_clc.fit(Xtrain, ytrain)
print(classification_report(ytrain, selected_clc.predict(Xtrain)))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       300
           1       0.99      0.97      0.98       297

    accuracy                           0.98       597
   macro avg       0.98      0.98      0.98       597
weighted avg       0.98      0.98      0.98       597



In [62]:
print(classification_report(df_test['labels'].values, selected_clc.predict(Xtest)))

              precision    recall  f1-score   support

           0       0.57      0.11      0.19        36
           1       0.47      0.90      0.62        31

    accuracy                           0.48        67
   macro avg       0.52      0.51      0.40        67
weighted avg       0.52      0.48      0.38        67

