In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn. metrics import confusion_matrix, ConfusionMatrixDisplay , precision_score, recall_score, f1_score, r2_score, roc_curve, roc_auc_score, classification_report

In [None]:
# feature_label_matrix = pd.read_csv('./labeled_feature_matrix.csv')
# display(feature_label_matrix.shape)

In [None]:
# feature_matrix = np.loadtxt('./labeled_feature_matrix.csv', delimiter=',')
# display(feature_matrix.shape)

In [37]:
# Run this cell when executing Preprocessing.v1
# Loads the v1 labeled feature array from a path relative to the current working
# directory. Run either this cell or the v2 loader cell, not both: each one assigns
# `feature_matrix`, so whichever runs last determines the data used below.
feature_matrix = np.load("../outputs-v1/labeled_feature_vector.npy")

In [None]:
# Run this cell when executing preprocessing.v2
# Reassigns `feature_matrix`, replacing anything loaded by an earlier loader cell.
# NOTE(review): this is an absolute, machine-specific path, so the cell only works
# where that directory exists — consider a path relative to the notebook instead.
feature_matrix = np.load("/home/rahul/Drive-D/clg/BTP-files/codes-v2/labeled_feature_v2.npy")

In [38]:
# Sanity check: (rows, columns) of the loaded array.
display(feature_matrix.shape)

(8640, 131)

In [39]:
# The last column of the loaded array is taken as the class label of each row.
labels = feature_matrix[:, -1]
# Should be one label per row of feature_matrix.
display(labels.shape)

(8640,)

In [40]:
# Feature columns: every column except the trailing label column.
data = feature_matrix[:, :-1]
display(data.shape)
# Report the [row, column] position of every NaN feature value. np.isnan runs over
# the whole array at once and np.argwhere returns the hits in row-major order, i.e.
# the same order as scanning rows first and columns second. Nothing is displayed
# when the data contains no NaN.
for i, j in np.argwhere(np.isnan(data)):
    display([int(i), int(j)])

(8640, 130)

In [41]:
# 10-fold stratified splitter used to derive the train/test split.
# NOTE(review): no shuffle/random_state is passed, so scikit-learn's defaults apply —
# confirm that an unshuffled split is appropriate for the row order of this dataset.
fold = StratifiedKFold(n_splits=10)

In [42]:
# Only a single train/test split is used downstream (metrics() reads x_train, y_train,
# x_test and y_test), so take the last of the stratified folds directly instead of
# iterating over every fold and overwriting the split each time.
train_index, test_index = list(fold.split(data, labels))[-1]

# Index arrays select the rows in one step; the results are NumPy arrays.
x_train, x_test = data[train_index], data[test_index]
y_train, y_test = labels[train_index], labels[test_index]


# x_train, x_test, y_train, y_test = train_test_split(data,labels,test_size=0.3)

In [43]:
def plot_cnf(cnf_matrix, text_threshold=None):
  """Render a confusion matrix as an annotated Plotly heatmap and show it.

  Parameters
  ----------
  cnf_matrix : 2-D array-like of counts
      Expected in the layout produced by sklearn.metrics.confusion_matrix:
      row i is the actual class and column j is the predicted class.
  text_threshold : number, optional
      Cells whose count is >= this value are annotated in white, all others in
      black, so the number stays readable on the 'Blues' colour scale. When
      omitted, the midpoint between the smallest and largest count is used.

  Returns
  -------
  None. The figure is displayed via fig.show().
  """
  cnf_matrix = np.asarray(cnf_matrix)
  n_rows, n_cols = cnf_matrix.shape

  if text_threshold is None:
      # Derive the contrast cut-off from the data instead of a fixed count, so the
      # annotations stay legible whatever the size of the evaluated test set.
      text_threshold = (cnf_matrix.min() + cnf_matrix.max()) / 2 if cnf_matrix.size else 0

  fig = px.imshow(cnf_matrix,
                 color_continuous_scale='Blues')

  # px.imshow draws matrix rows along the y axis and columns along the x axis, so
  # the y axis carries the actual labels and the x axis the predicted labels.
  fig.update_layout(
      title="Confusion Matrix with Rest-case-labeled: 0, One-back-labeled: 1",
      xaxis_title="Predicted Labels",
      yaxis_title="Actual Labels",
      width=700,
      height=700,
      font=dict(size=12),
      # One tick per class index on both axes.
      xaxis=dict(tick0=0, dtick=1),
      yaxis=dict(tick0=0, dtick=1),
  )

  # Write each count into the centre of its cell.
  for i in range(n_rows):
      for j in range(n_cols):
          color = 'white' if cnf_matrix[i, j] >= text_threshold else 'black'
          fig.add_annotation(text=str(cnf_matrix[i, j]),
                             x=j,
                             y=i,
                             showarrow=False,
                             font=dict(color=color, size=24),
                             )

  fig.show()

In [44]:
def plot_roc(tpr, fpr):
  """Plot a ROC curve together with a dashed chance diagonal and show it.

  tpr and fpr are matching sequences of true- and false-positive rates; fpr is
  drawn on the x axis and tpr on the y axis. Returns None.
  """
  roc_fig = px.line(x=fpr, y=tpr)

  # Reference diagonal from (0, 0) to (1, 1), listed as "Guessing" in the legend.
  diagonal_style = dict(color='navy', dash='dash')
  roc_fig.add_scatter(x=[0,1], y=[0,1], line=diagonal_style, name="Guessing")

  layout_options = dict(
      title='ROC Curve',
      xaxis_title='False Positive Rate',
      yaxis_title='True Positive Rate',
      width=700,
      height=500,
  )
  roc_fig.update_layout(**layout_options)

  roc_fig.show()

In [56]:
# Performance metrics calculation function

def metrics(model):
  """Evaluate a classifier and print/plot its performance summary.

  The function reads the notebook-level variables data, labels, x_train, y_train,
  x_test and y_test. Accuracy is the mean of a 10-fold cross_val_score over the
  full data/labels; the classification report, confusion matrix, ROC curve and AUC
  all come from fitting `model` on x_train/y_train and predicting on x_test.

  `model` must provide fit, predict and predict_proba, and is fitted in place.
  Returns None.
  """
  # Mean accuracy over the 10 cross-validation folds.
  fold_scores = cross_val_score(model, data, labels, cv=10)
  cross_val_acc = np.average(fold_scores)

  # Fit on the hold-out training split, then predict the test split.
  model.fit(x_train, y_train)
  predicted_labels = model.predict(X=x_test)
  # Second probability column, used as the score for the positive class below.
  positive_scores = model.predict_proba(X=x_test)[:, 1]

  # Per-class report as a table; the "accuracy" entry is dropped before transposing.
  report_dict = classification_report(y_true=y_test,
                                      y_pred=predicted_labels,
                                      output_dict=True,
                                      zero_division=0,
                                      )
  classif_report = pd.DataFrame(report_dict).drop(labels="accuracy", axis=1).T.round(2)

  cnf_matrix = confusion_matrix(y_true=y_test, y_pred=predicted_labels)

  # ROC points and the area under the curve from the positive-class scores;
  # the thresholds returned by roc_curve are not needed.
  fpr, tpr, _ = roc_curve(y_test, positive_scores)
  auc = roc_auc_score(y_test, positive_scores)

  print(f"Cross-val-mean-Accuracy: {cross_val_acc:.4f}\n")
  display(classif_report)
  plot_cnf(cnf_matrix=cnf_matrix)
  plot_roc(tpr=tpr, fpr=fpr)
  print(f"Area Under the ROC Curve (AUC): {auc:.4f}")

In [46]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes classifier created with no arguments, i.e. library defaults.
gnb = GaussianNB()

In [47]:
metrics(model=gnb)

Cross-val-mean-Accuracy: 0.4424



Unnamed: 0,precision,recall,f1-score,support
0.0,0.29,0.18,0.23,432.0
1.0,0.41,0.56,0.47,432.0
macro avg,0.35,0.37,0.35,864.0
weighted avg,0.35,0.37,0.35,864.0


Area Under the ROC Curve (AUC): 0.3092


In [48]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the liblinear solver. `multi_class` is deliberately not
# passed: 'auto' was already its default, and the parameter is deprecated in recent
# scikit-learn releases, so passing it only triggers a warning (and later an error).
lr = LogisticRegression(solver='liblinear')

In [49]:
metrics(lr)

Cross-val-mean-Accuracy: 0.4992



Unnamed: 0,precision,recall,f1-score,support
0.0,0.6,0.49,0.54,432.0
1.0,0.57,0.68,0.62,432.0
macro avg,0.58,0.58,0.58,864.0
weighted avg,0.58,0.58,0.58,864.0


Area Under the ROC Curve (AUC): 0.5704


In [50]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 12 trees. The fixed random_state makes the bootstrap samples and
# feature choices, and therefore the reported metrics, identical on every run.
rfc = RandomForestClassifier(n_estimators=12, random_state=42)

In [51]:
metrics(rfc)

Cross-val-mean-Accuracy: 0.4800



Unnamed: 0,precision,recall,f1-score,support
0.0,0.42,0.58,0.49,432.0
1.0,0.34,0.22,0.27,432.0
macro avg,0.38,0.4,0.38,864.0
weighted avg,0.38,0.4,0.38,864.0


Area Under the ROC Curve (AUC): 0.3502


In [52]:
from sklearn.svm import SVC
# Support vector classifier. probability=True enables predict_proba, which metrics()
# requires; those probability estimates are fitted with an internal randomised
# cross-validation, so random_state is fixed to keep the results reproducible.
# NOTE(review): gamma='auto' depends only on the number of features, not on their
# scale — confirm whether the features should be standardised before this model.
svc = SVC(gamma='auto',
          probability=True,
          random_state=42,
          )

In [57]:
metrics(svc)

Cross-val-mean-Accuracy: 0.5003



Unnamed: 0,precision,recall,f1-score,support
0.0,0.5,1.0,0.67,432.0
1.0,0.0,0.0,0.0,432.0
macro avg,0.25,0.5,0.33,864.0
weighted avg,0.25,0.5,0.33,864.0


Area Under the ROC Curve (AUC): 0.4860


In [54]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier voting over 27 neighbours.
# NOTE(review): the choice of 27 is not explained in this notebook — presumably it
# was tuned elsewhere; confirm and record how it was chosen.
knn = KNeighborsClassifier(n_neighbors=27)

In [55]:
metrics(knn)

Cross-val-mean-Accuracy: 0.4708



Unnamed: 0,precision,recall,f1-score,support
0.0,0.42,0.42,0.42,432.0
1.0,0.42,0.42,0.42,432.0
macro avg,0.42,0.42,0.42,864.0
weighted avg,0.42,0.42,0.42,864.0


Area Under the ROC Curve (AUC): 0.3896
