In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn. metrics import confusion_matrix, ConfusionMatrixDisplay , precision_score, recall_score, f1_score, r2_score, roc_curve, roc_auc_score, classification_report



In [None]:
# feature_label_matrix = pd.read_csv('./labeled_feature_matrix.csv')
# display(feature_label_matrix.shape)

In [None]:
# feature_matrix = np.loadtxt('./labeled_feature_matrix.csv', delimiter=',')
# display(feature_matrix.shape)

In [37]:
# Run this cell when executing Preprocessing.v1
# NOTE: several cells in this notebook assign `feature_matrix` from different
# .npy files; whichever of them ran last decides the dataset used downstream.
feature_matrix = np.load("../outputs-v1/labeled_feature_vector.npy")

In [46]:
# Run this cell when executing preprocessing.v2 with filtering
# Replaces any `feature_matrix` loaded by a previously executed cell.
feature_matrix = np.load("../codes-v2/labeled_feature_v2.npy")

In [125]:
# Run this cell when doing feature extraction specific to features
# NOTE(review): the wording above does not say which feature-extraction variant
# this file belongs to (the file name hints at a "rest vs. all" labelling) -
# confirm. Replaces any `feature_matrix` loaded by a previously executed cell.
feature_matrix = np.load("../codes-v2/labeled_feature_rest-2-all.npy")

In [32]:
# Run this cell when executing preprocessing.v2 with four class
# Replaces any `feature_matrix` loaded by a previously executed cell.
feature_matrix = np.load("../codes-v2/labeled_feature_v3.npy")

In [115]:
display(feature_matrix.shape)

(8391, 128)

In [126]:
# The last column of the loaded matrix is used as the class label of each row.
labels = feature_matrix[:, -1]
display(labels.shape)

(8391,)

In [104]:
print(len(np.unique(labels)))

2


In [127]:
# Every column except the last one is treated as a feature.
data = feature_matrix[:, :-1]
display(data.shape)

# Report the [row, column] position of every NaN in the feature matrix.
# np.argwhere scans the array in one vectorized pass and returns the positions
# in row-major order, i.e. the same order the nested Python loops visited them.
for nan_row, nan_col in np.argwhere(np.isnan(data)):
    display([int(nan_row), int(nan_col)])

(8391, 130)

In [18]:
# 10 stratified folds. Neither `shuffle` nor `random_state` is passed, so the
# library defaults apply to how rows are assigned to folds.
fold = StratifiedKFold(n_splits=10)

In [128]:
# Hold-out split used by `metrics`.
# The previous loop reassigned x_train/x_test/y_train/y_test on every iteration,
# so only the LAST fold produced by `fold.split` was ever kept. That choice is
# now explicit, and the rows are selected with NumPy index arrays instead of
# building Python lists one row at a time (the results are ndarrays, which the
# estimators and sklearn metric functions accept just like the former lists).
train_index, test_index = list(fold.split(data, labels))[-1]
x_train, x_test = data[train_index], data[test_index]
y_train, y_test = labels[train_index], labels[test_index]


# x_train, x_test, y_train, y_test = train_test_split(data,labels,test_size=0.3)

In [52]:
print(len(np.unique(y_test)))

2


In [20]:
def plot_cnf(cnf_matrix,
             title="Confusion Matrix with Rest-case-labeled: 0, One-back-labeled: 1",
             white_text_threshold=220):
  """Show a confusion matrix as an annotated Plotly heatmap.

  Parameters
  ----------
  cnf_matrix : 2-D array
      Matrix indexed as ``cnf_matrix[i, j]``. When it comes from
      ``sklearn.metrics.confusion_matrix`` (as in ``metrics``), row ``i`` is the
      true class and column ``j`` is the predicted class.
  title : str, optional
      Figure title. The default is the title this function always used.
  white_text_threshold : number, optional
      Cells whose value is >= this threshold are annotated in white, all other
      cells in black. The default (220) is the value that was hard-coded before.

  Returns
  -------
  None
      The figure is rendered with ``fig.show()``.
  """
  fig = px.imshow(cnf_matrix,
                 color_continuous_scale='Blues')

  # px.imshow draws matrix rows along the y axis and columns along the x axis,
  # so for a sklearn confusion matrix x is the predicted label and y the actual
  # label. (The axis titles used to be the other way round.)
  fig.update_layout(
      title=title,
      xaxis_title="Predicted Labels",
      yaxis_title="Actual Labels",
      width=700,
      height=700,
      font=dict(size=12),
      # One tick per class index on both axes.
      xaxis=dict(tick0=0, dtick=1),
      yaxis=dict(tick0=0, dtick=1),
  )

  n_rows, n_cols = cnf_matrix.shape
  for i in range(n_rows):
      for j in range(n_cols):
          cell_value = cnf_matrix[i, j]
          # White text on cells at or above the threshold, black otherwise.
          color = 'white' if cell_value >= white_text_threshold else 'black'

          fig.add_annotation(text=str(cell_value),
                            x=j,
                            y=i,
                            showarrow=False,
                            font=dict(
                                color=color,
                                size=24,
                            )
                            )

  fig.show()

In [21]:
def plot_roc(tpr, fpr):
  """Show a Plotly ROC curve.

  ``fpr`` is plotted on the x axis and ``tpr`` on the y axis, together with a
  dashed navy reference line from (0, 0) to (1, 1) labelled "Guessing".
  Returns None; the figure is rendered with ``show()``.
  """
  chance_line_style = dict(color='navy', dash='dash')
  layout_options = dict(
      title='ROC Curve',
      xaxis_title='False Positive Rate',
      yaxis_title='True Positive Rate',
      width=700,
      height=500,
  )

  roc_figure = px.line(x=fpr, y=tpr)
  # Reference diagonal drawn on top of the curve.
  roc_figure.add_scatter(x=[0, 1],
                         y=[0, 1],
                         line=chance_line_style,
                         name="Guessing")
  roc_figure.update_layout(**layout_options)
  roc_figure.show()

In [161]:
# Performance metrics calculation function

def metrics(model, labels=None):
  """Cross-validate, fit and report the performance of a classifier.

  Steps:
    1. mean 10-fold ``cross_val_score`` of ``model`` on ``data`` / ``labels``;
    2. fit ``model`` on ``x_train`` / ``y_train``;
    3. classification report, confusion matrix and ROC AUC on ``x_test`` /
       ``y_test`` (ROC curve plot for two classes; micro/macro one-vs-rest and
       macro one-vs-one AUC otherwise).

  The notebook-level variables ``data``, ``x_train``, ``y_train``, ``x_test``
  and ``y_test`` are read when the function is called.

  Parameters
  ----------
  model : estimator
      Classifier exposing ``fit``, ``predict`` and ``predict_proba``. It is
      fitted by this call.
  labels : array-like, optional
      Labels matching the rows of ``data``. When omitted, the notebook-level
      ``labels`` is looked up at call time. (It used to be captured once when
      the function was defined, so re-loading a dataset without re-running the
      definition paired new ``data`` with old labels.)

  Returns
  -------
  None
      Results are printed / displayed.
  """
  if labels is None:
    # Resolve at call time so the labels always match the current `data`.
    labels = globals()["labels"]

  # Two-class problems get a ROC curve; everything else gets multi-class AUCs.
  is_binary = len(np.unique(labels)) == 2

  # Using average of cross val score for accuracy
  score = cross_val_score(model,
                          data,
                          labels,
                          cv=10,
                          )
  cross_val_acc = np.average(score)

  # Training the model on x_train and y_train
  model.fit(x_train, y_train)

  # Class-label predictions and per-class probabilities on the hold-out rows
  model_predictions = model.predict(X=x_test)
  y_pred_prob = model.predict_proba(X=x_test)

  # Classification report as a dataframe, without the accuracy column
  classif_report = pd.DataFrame(classification_report(y_true=y_test,
                                                      y_pred=model_predictions,
                                                      output_dict=True,
                                                      zero_division=0,
                                                      )
                                ).drop(labels="accuracy", axis=1).T.round(2)

  # Confusion matrix from the predicted class labels
  cnf_matrix = confusion_matrix(y_true=y_test, y_pred=model_predictions)

  if is_binary:
    # Column 1 of predict_proba belongs to model.classes_[1]. Passing it as
    # pos_label keeps roc_curve working when the two classes are not {0, 1}
    # (e.g. 0 vs 2), where roc_curve cannot infer the positive class itself.
    positive_class = model.classes_[1]
    fpr, tpr, thresholds = roc_curve(
                                     y_test,
                                     y_pred_prob[:, 1],
                                     pos_label=positive_class,
                                    )

    # Area under the ROC curve from the positive-class probabilities
    auc = roc_auc_score(y_test, y_pred_prob[:, 1])

  else:
    micro_roc_auc_ovr = roc_auc_score(
                                      y_test,
                                      y_pred_prob,
                                      multi_class="ovr",
                                      average="micro",
                                      )

    macro_roc_auc_ovr = roc_auc_score(
                                      y_test,
                                      y_pred_prob,
                                      multi_class="ovr",
                                      average="macro",
                                      )

    macro_roc_auc_ovo = roc_auc_score(
                                      y_test,
                                      y_pred_prob,
                                      multi_class="ovo",
                                      average="macro",
                                      )

  # Everything is computed above before anything is shown, so a failure in a
  # metric does not leave a half-rendered report.
  print(f"Cross-val-mean-Accuracy: {100*cross_val_acc:.2f}\n")

  display(classif_report)

  plot_cnf(cnf_matrix=cnf_matrix)

  if is_binary:
    plot_roc(tpr=tpr,
            fpr=fpr,
            )
    print(f"Area Under the ROC Curve (AUC): {100*auc:.2f}")

  else:
    print(f"\nMicro-averaged One-vs-Rest ROC AUC score:{micro_roc_auc_ovr:.2f}")
    print(f"\nMacro-averaged One-vs-Rest ROC AUC score:{macro_roc_auc_ovr:.2f}")
    print(f"\nMacro-averaged One-vs-One ROC AUC score:{macro_roc_auc_ovo:.2f}")

In [41]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [42]:
metrics(model=gnb)

Cross-val-mean-Accuracy: 0.6226



Unnamed: 0,precision,recall,f1-score,support
0.0,0.49,0.97,0.65,417.0
2.0,0.35,0.02,0.03,422.0
macro avg,0.42,0.49,0.34,839.0
weighted avg,0.42,0.49,0.34,839.0


Area Under the ROC Curve (AUC): 0.4391


In [18]:
from sklearn.linear_model import LogisticRegression

# `multi_class='auto'` was only restating the default and the parameter is
# deprecated in recent scikit-learn releases, so it is no longer passed; with
# the liblinear solver the fitted model is the same either way.
lr = LogisticRegression(solver='liblinear')

In [19]:
metrics(lr)

Cross-val-mean-Accuracy: 0.5026



Unnamed: 0,precision,recall,f1-score,support
0.0,0.0,0.0,0.0,417.0
3.0,0.5,1.0,0.67,422.0
macro avg,0.25,0.5,0.33,839.0
weighted avg,0.25,0.5,0.34,839.0


Area Under the ROC Curve (AUC): 0.5000


In [158]:
from xgboost import XGBClassifier
xgb = XGBClassifier()

In [180]:
# Reload the rest-2-all dataset and rebuild everything `metrics` reads:
# labels, features and the hold-out split.
feature_matrix = np.load("../codes-v2/labeled_feature_rest-2-all.npy")
display(feature_matrix.shape)

# Last column = class label, all other columns = features.
labels = feature_matrix[:, -1]
display(labels.shape)

data = feature_matrix[:, :-1]
display(data.shape)

# Report the [row, column] position of every NaN. np.argwhere is vectorized and
# returns positions in row-major order, the same order the nested Python loops
# used to visit them.
for nan_row, nan_col in np.argwhere(np.isnan(data)):
    display([int(nan_row), int(nan_col)])

# Only the LAST fold was ever kept by the former loop (each iteration overwrote
# the previous split); make that explicit and index with NumPy arrays instead
# of building Python lists row by row.
train_index, test_index = list(fold.split(data, labels))[-1]
x_train, x_test = data[train_index], data[test_index]
y_train, y_test = labels[train_index], labels[test_index]


(8391, 131)

(8391,)

(8391, 130)

In [181]:
metrics(xgb)

Cross-val-mean-Accuracy: 81.08



Unnamed: 0,precision,recall,f1-score,support
0.0,0.96,0.94,0.95,417.0
1.0,0.94,0.96,0.95,422.0
macro avg,0.95,0.95,0.95,839.0
weighted avg,0.95,0.95,0.95,839.0


Area Under the ROC Curve (AUC): 98.72


In [135]:
from sklearn.ensemble import ExtraTreesClassifier
# Fixed seed so the randomized trees (and therefore the reported scores) are
# reproducible from one run of the notebook to the next.
etc = ExtraTreesClassifier(n_estimators=50, random_state=42)

In [157]:
metrics(etc)

Cross-val-mean-Accuracy: 0.7998



Unnamed: 0,precision,recall,f1-score,support
0.0,0.91,0.87,0.89,417.0
1.0,0.88,0.91,0.89,422.0
macro avg,0.89,0.89,0.89,839.0
weighted avg,0.89,0.89,0.89,839.0


Area Under the ROC Curve (AUC): 0.9622


In [134]:
from sklearn.ensemble import RandomForestClassifier
# Fixed seed so the bootstrapped forest (and therefore the reported scores) is
# reproducible from one run of the notebook to the next.
rfc = RandomForestClassifier(n_estimators=50, random_state=42)

In [133]:
metrics(rfc)

Cross-val-mean-Accuracy: 0.8173



Unnamed: 0,precision,recall,f1-score,support
0.0,0.95,0.88,0.92,417.0
1.0,0.89,0.95,0.92,422.0
macro avg,0.92,0.92,0.92,839.0
weighted avg,0.92,0.92,0.92,839.0


Area Under the ROC Curve (AUC): 0.9780


In [22]:
from sklearn.svm import SVC
svc = SVC(gamma='auto',
          probability=True,
          )

In [23]:
metrics(svc)

Cross-val-mean-Accuracy: 0.5026



Unnamed: 0,precision,recall,f1-score,support
0.0,0.0,0.0,0.0,417.0
3.0,0.5,1.0,0.67,422.0
macro avg,0.25,0.5,0.33,839.0
weighted avg,0.25,0.5,0.34,839.0


Area Under the ROC Curve (AUC): 0.5000


In [26]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=27)

In [27]:
metrics(knn)

Cross-val-mean-Accuracy: 0.2589



Unnamed: 0,precision,recall,f1-score,support
0.0,0.4,0.37,0.38,417.0
1.0,0.26,0.26,0.26,422.0
2.0,0.24,0.32,0.27,422.0
3.0,0.28,0.22,0.25,421.0
macro avg,0.3,0.29,0.29,1682.0
weighted avg,0.3,0.29,0.29,1682.0



Micro-averaged One-vs-Rest ROC AUC score:0.57

Macro-averaged One-vs-Rest ROC AUC score:0.57

Macro-averaged One-vs-One ROC AUC score:0.57
