In [393]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn. metrics import confusion_matrix, ConfusionMatrixDisplay , precision_score, recall_score, f1_score, r2_score, roc_curve, roc_auc_score, classification_report

### Subject id

In [394]:
# Subject whose feature file is loaded below; set this (and re-run the cell)
# when checking metrics for a single subject.
subjectid = "28"

In [395]:
# Per-subject feature file; the file name is derived from `subjectid`.
feature_path = f"./data/hjorth-psd-wavelet-sub-{subjectid}.npy"
feature_matrix = np.load(feature_path)

In [396]:
# Quick sanity check of the loaded matrix dimensions.
display(np.shape(feature_matrix))

(837, 130)

In [397]:
# The last column of the feature matrix is used as the class label.
labels = feature_matrix[..., -1]
display(np.shape(labels))

(837,)

In [398]:
# Distinct class labels present for this subject.
print(np.unique(labels))

[0. 1.]


In [399]:
# Feature columns: everything except the trailing label column.
data = feature_matrix[:, :-1]
display(data.shape)

# Report the [row, column] position of every NaN feature value.
# A single vectorised pass replaces the element-by-element Python loop;
# np.argwhere yields positions in row-major order, i.e. the same order the
# nested loop visited them, and .tolist() keeps each position a plain list.
for nan_position in np.argwhere(np.isnan(data)).tolist():
    display(nan_position)

(837, 129)

In [400]:
# Stratified 10-fold splitter; used below to derive the train/test hold-out split.
fold = StratifiedKFold(n_splits=10)

In [401]:
# Hold-out split consumed by the plotting/metric helpers below.
# The previous loop overwrote these four names on every fold, so only the LAST
# fold ever survived. Selecting that fold explicitly gives the same split
# without building and discarding the earlier ones, and NumPy index arrays
# yield proper ndarrays instead of Python lists of rows.
train_index, test_index = list(fold.split(data, labels))[-1]
x_train, x_test = data[train_index], data[test_index]
y_train, y_test = labels[train_index], labels[test_index]

In [402]:
# Number of distinct classes present in the hold-out test labels.
print(np.unique(y_test).size)

2


### Plot Confusion Matrix Function

In [403]:
def plot_cnf(cnf_matrix, white_text_threshold=220):
  """Show a confusion matrix as an annotated Plotly heat-map.

  Args:
    cnf_matrix: 2-D array of counts laid out the way
      ``sklearn.metrics.confusion_matrix`` returns it: row ``i`` is the
      actual class and column ``j`` is the predicted class.
    white_text_threshold: Cells whose count is at or above this value are
      annotated in white (they fall on the dark end of the 'Blues' scale);
      all other cells are annotated in black. Defaults to 220, the cut-off
      that used to be hard-coded.

  Returns:
    None. The figure is rendered with ``fig.show()``.
  """
  cnf_matrix = np.asarray(cnf_matrix)

  fig = px.imshow(cnf_matrix,
                  color_continuous_scale='Blues')

  # imshow draws matrix rows along the y-axis and columns along the x-axis
  # (the annotations below are placed at x=j, y=i for cell [i, j]). With rows
  # holding the actual class, the y-axis is "Actual" and the x-axis is
  # "Predicted" -- the two titles were previously swapped.
  fig.update_layout(
      title="Confusion Matrix with Rest-case-labeled: 0, One-back-labeled: 1",
      xaxis_title="Predicted Labels",
      yaxis_title="Actual Labels",
      width=700,
      height=700,
      font=dict(size=12),
      # One tick per class index on both axes.
      xaxis=dict(tick0=0, dtick=1),
      yaxis=dict(tick0=0, dtick=1),
  )

  # Use the real shape so a non-square matrix is annotated completely.
  n_rows, n_cols = cnf_matrix.shape
  for i in range(n_rows):
      for j in range(n_cols):
          color = 'white' if cnf_matrix[i, j] >= white_text_threshold else 'black'

          fig.add_annotation(text=str(cnf_matrix[i, j]),
                             x=j,
                             y=i,
                             showarrow=False,
                             font=dict(color=color, size=24),
                             )

  fig.show()

### Plot ROC Curve Function

In [404]:
def plot_roc(tpr, fpr):
  """Draw one ROC curve together with the chance diagonal and show it.

  Args:
    tpr: True-positive-rate values, plotted on the y-axis.
    fpr: False-positive-rate values, plotted on the x-axis.

  Note the argument order is (tpr, fpr), i.e. y first, then x.
  """
  roc_figure = px.line(x=fpr, y=tpr)

  # Dashed diagonal from (0, 0) to (1, 1), labelled "Guessing" in the legend.
  diagonal_style = {"color": "navy", "dash": "dash"}
  roc_figure.add_scatter(x=[0, 1], y=[0, 1], line=diagonal_style, name="Guessing")

  layout_options = {
      "title": 'ROC Curve',
      "xaxis_title": 'False Positive Rate',
      "yaxis_title": 'True Positive Rate',
      "width": 700,
      "height": 500,
  }
  roc_figure.update_layout(**layout_options)

  roc_figure.show()

### Plot multiple ROC plots

In [405]:
# Function to plot multiple ROC curves on a single plot

def multi_roc_plot(models, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test):
  """Fit each model and overlay their ROC curves on one Plotly figure.

  Args:
    models: Iterable of estimators exposing ``fit`` and ``predict_proba``.
      Each one is fitted in place on ``x_train``/``y_train``.
    x_train, y_train: Training features and labels. The defaults are the
      notebook-level split as it existed when this cell was executed
      (default values are bound at definition time).
    x_test, y_test: Evaluation features and labels, with the same defaults rule.

  Returns:
    None. The figure is rendered with ``fig.show()``.
  """
  palette = px.colors.qualitative.Set1

  fig = px.line()

  for position, model in enumerate(models):
    # Training the model on x_train and y_train
    model.fit(x_train, y_train)

    # Column 1 of predict_proba is the score of the second class, which is
    # used as the positive class for the ROC curve.
    positive_scores = model.predict_proba(X=x_test)[:, 1]

    fpr, tpr, _ = roc_curve(y_test, positive_scores)

    fig.add_scatter(
                    x=fpr,
                    y=tpr,
                    mode='lines',
                    # Cycle through the palette so that passing more models
                    # than there are colours no longer raises IndexError.
                    line=dict(color=palette[position % len(palette)]),
                    # The class name is the legend entry. Slicing str(model)
                    # at the first '(' dropped the last character whenever
                    # the repr contained no parenthesis.
                    name=type(model).__name__,
                    )

  # Dashed diagonal from (0, 0) to (1, 1) as the chance-level reference.
  fig.add_scatter(x=[0, 1],
                  y=[0, 1],
                  line=dict(color='navy', dash='dash'),
                  name="Guessing"
                  )

  fig.update_layout(
      title='ROC Curve',
      xaxis_title='False Positive Rate',
      yaxis_title='True Positive Rate',
      width=700,
      height=500,
  )

  fig.show()

### Metrics Function

In [406]:
# Performance metrics calculation function

def metrics(model, labels, xTest=None, yTest=None):
  """Fit ``model`` on the hold-out training split and compute its metrics.

  Two modes, selected by ``xTest``:
    * ``xTest`` omitted: 10-fold cross-validated accuracy over the
      notebook-level ``data`` and the given ``labels`` is computed and
      printed; the remaining metrics are evaluated on the notebook-level
      ``x_test``/``y_test``.
    * ``xTest`` given: no cross-validation is run and nothing is printed;
      the metrics are evaluated on ``xTest``/``yTest``.

  In both modes the model is fitted in place on the notebook-level
  ``x_train``/``y_train``.

  Args:
    model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    labels: Full label vector; used for cross-validation and to decide
      between the binary and the multi-class AUC computation.
    xTest: Optional external test features.
    yTest: Labels for ``xTest``; required whenever ``xTest`` is given.

  Returns:
    None. The classification report, confusion matrix and ROC/AUC values are
    computed for the display code at the end of the function, which is
    currently commented out.

  Raises:
    ValueError: If ``xTest`` is given without ``yTest``.
  """
  # An identity test instead of `np.all(xTest == None)`: the element-wise
  # comparison treated an empty array as "not provided".
  use_holdout = xTest is None
  if not use_holdout and yTest is None:
    raise ValueError("yTest must be provided together with xTest")

  if use_holdout:
    # Using average of cross val score for accuracy
    score = cross_val_score(model,
                            data,
                            labels,
                            cv=10,
                            )

    cross_val_acc = np.average(score)
    cross_val_std = np.std(score)

  # Training the model on x_train and y_train
  model.fit(x_train, y_train)

  # Pick the evaluation set once so that every metric below is computed
  # against the same features and labels.
  eval_x, eval_y = (x_test, y_test) if use_holdout else (xTest, yTest)

  # Class-label predictions and per-class probabilities for the evaluation set
  model_predictions = model.predict(X=eval_x)         # one label per evaluated row
  y_pred_prob = model.predict_proba(X=eval_x)         # one probability column per class

  # Classification report as a dataframe; the accuracy column is dropped only
  # in hold-out mode (accuracy is reported there via cross-validation).
  classif_report = pd.DataFrame(classification_report(y_true=eval_y,
                                                      y_pred=model_predictions,
                                                      output_dict=True,
                                                      zero_division=0,
                                                      )
                                )
  if use_holdout:
    classif_report = classif_report.drop(labels="accuracy", axis=1)
  classif_report = classif_report.T.round(2)

  # Building the Confusion Matrix using the predicted class labels
  cnf_matrix = confusion_matrix(y_true=eval_y, y_pred=model_predictions)

  if len(np.unique(labels)) == 2:
    # Binary case: ROC curve and AUC from the probability of the second class
    fpr, tpr, thresholds = roc_curve(
                                    eval_y,
                                    y_pred_prob[:, 1],
                                    )
    auc = roc_auc_score(eval_y, y_pred_prob[:, 1])

  else:
    # Multi-class case. These scores used to be computed against the global
    # y_test even when an external yTest was supplied; they now use the same
    # evaluation labels as every other metric.
    micro_roc_auc_ovr = roc_auc_score(
                                      eval_y,
                                      y_pred_prob,
                                      multi_class="ovr",
                                      average="micro",
                                      )

    macro_roc_auc_ovr = roc_auc_score(
                                      eval_y,
                                      y_pred_prob,
                                      multi_class="ovr",
                                      average="macro",
                                      )

    macro_roc_auc_ovo = roc_auc_score(
                                      eval_y,
                                      y_pred_prob,
                                      multi_class="ovo",
                                      average="macro",
                                      )

  if use_holdout:
    # NOTE(review): the mean is printed as a percentage but the standard
    # deviation as a fraction (0.02 means 2 percentage points). The format is
    # kept as-is so the recorded cell outputs stay valid.
    print(f"Cross-val-mean-Accuracy: {100*cross_val_acc:.2f}\n Cross-val-accuarcy-std: {cross_val_std}")
    print(f"Cross-val-Accuracy: {100*cross_val_acc:.2f} +- {cross_val_std}")

  # Optional detailed output -- uncomment to display the values computed above.

  # display(classif_report)

  # plot_cnf(cnf_matrix=cnf_matrix)

  # if len(np.unique(labels)) == 2:
  #   plot_roc(tpr=tpr,
  #           fpr=fpr,
  #           )

  # if len(np.unique(labels)) == 2:
  #   print(f"Area Under the ROC Curve (AUC): {100*auc:.2f}")

  # else:
  #   print(f"\nMicro-averaged One-vs-Rest ROC AUC score:{micro_roc_auc_ovr:.2f}")
  #   print(f"\nMacro-averaged One-vs-Rest ROC AUC score:{macro_roc_auc_ovr:.2f}")
  #   print(f"\nMacro-averaged One-vs-One ROC AUC score:{macro_roc_auc_ovo:.2f}")

### XGB classifier

In [407]:
from xgboost import XGBClassifier
# XGBoost classifier created with the library's default hyper-parameters.
xgb = XGBClassifier()

In [408]:
# Evaluate XGBoost on the hold-out fold. To score an external test set
# instead, additionally pass xTest=... and yTest=...
metrics(model=xgb, labels=labels)

Cross-val-mean-Accuracy: 97.86
 Cross-val-accuarcy-std: 0.024852414008836556
Cross-val-Accuracy: 97.86 +- 0.024852414008836556


### ETC 

In [409]:
from sklearn.ensemble import ExtraTreesClassifier
# Extra-trees ensemble of 50 trees. The seed is fixed so that the randomised
# tree construction -- and therefore the reported scores -- are reproducible
# across kernel restarts.
etc = ExtraTreesClassifier(n_estimators=50, random_state=42)

In [410]:
# Cross-validated accuracy for the extra-trees classifier.
metrics(model=etc, labels=labels)

Cross-val-mean-Accuracy: 97.61
 Cross-val-accuarcy-std: 0.026639498630181717
Cross-val-Accuracy: 97.61 +- 0.026639498630181717


### RFC

In [411]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 50 trees. The seed is fixed so that bootstrap sampling and
# feature selection -- and therefore the reported scores -- are reproducible
# across kernel restarts.
rfc = RandomForestClassifier(n_estimators=50, random_state=42)

In [412]:
# Cross-validated accuracy for the random forest.
metrics(model=rfc, labels=labels)

Cross-val-mean-Accuracy: 97.26
 Cross-val-accuarcy-std: 0.03239644500246336
Cross-val-Accuracy: 97.26 +- 0.03239644500246336


### Gaussian naive bayes

In [413]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes classifier created with default settings.
gnb = GaussianNB()

In [414]:
# Cross-validated accuracy for Gaussian naive Bayes.
metrics(model=gnb, labels=labels)

Cross-val-mean-Accuracy: 61.55
 Cross-val-accuarcy-std: 0.2365874883875021
Cross-val-Accuracy: 61.55 +- 0.2365874883875021


### Logistic Regression

In [415]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier using the liblinear solver.
# NOTE(review): `multi_class` appears to be deprecated in recent scikit-learn
# releases -- confirm against the installed version before relying on it.
lr = LogisticRegression(multi_class='auto', solver='liblinear')

In [416]:
# Cross-validated accuracy for logistic regression.
metrics(model=lr, labels=labels)

Cross-val-mean-Accuracy: 62.86
 Cross-val-accuarcy-std: 0.24478633076571832
Cross-val-Accuracy: 62.86 +- 0.24478633076571832


### SVC

In [417]:
from sklearn.svm import SVC
# Support-vector classifier. probability=True is needed because the metrics
# helper calls predict_proba on every model.
svc = SVC(probability=True, gamma='auto')

In [418]:
# Cross-validated accuracy for the support-vector classifier.
metrics(model=svc, labels=labels)

Cross-val-mean-Accuracy: 58.82
 Cross-val-accuarcy-std: 0.21315294004006452
Cross-val-Accuracy: 58.82 +- 0.21315294004006452


### KNN 

In [419]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier that votes over the 27 nearest training samples.
knn = KNeighborsClassifier(n_neighbors=27)

In [420]:
# Cross-validated accuracy for k-nearest neighbours.
metrics(model=knn, labels=labels)

Cross-val-mean-Accuracy: 71.70
 Cross-val-accuarcy-std: 0.2095872735690811
Cross-val-Accuracy: 71.70 +- 0.2095872735690811
