# Data Mining Final Project
##           - Rajendra Prasad Patil

### Glossary:
* Import libraries
* Load dataset
* Analysis on dataset
* Splitting the dataset into labels and features
* Performing normalization on dataset
* Splitting dataset using K fold 
* Running the model
    * SVM Model
    * K Nearest Neighbors
    * Random Forest Classifier
* Output Performance Metrics



### Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

# SVM classifier
from sklearn import svm

# KNN classifier
from sklearn.neighbors import KNeighborsClassifier 

#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# Import libraries for lstm classification
from keras.layers import Dense, Dropout, LSTM, Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential

# for checking the model accuracy
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

### Loading the dataset

In [2]:
dataset = load_breast_cancer()

In [3]:
input_length = len(dataset['data'][0])

### Preliminary analysis

In [4]:
class_names = dataset['target_names']
print('Target variables  : ', class_names)

(unique, counts) = np.unique(dataset['target'], return_counts=True)

print('Unique values of the target variable', unique)
print('Counts of the target variable :', counts)

Target variables  :  ['malignant' 'benign']
Unique values of the target variable [0 1]
Counts of the target variable : [212 357]


* The dataset is suited for binary classification
* The dataset has no skewed nature

### The data is split into features and labels

In [5]:
X = dataset['data']
y = dataset['target']

### Apply normalization operation for numerical stability

In [6]:
standardizer = StandardScaler()
X = standardizer.fit_transform(X)

# Performance Metrics
Function to calculate all the available performance metrics

In [7]:
performance_metrics = ['True Negative', 'False Positive', 'False Negative', 'True Positivity', 'Sensitivity', 'Specificity', 
                       'Precision', 'Accuracy', 'F1 Score', 'Error Rate', 'Negative Predicted Value', 'False Positve Rate', 
                       'False Discovery Rate', 'False Negative Rate', 'Balanced Accuracy', 'True Skill Statistics', 
                       'Heidke Skill Score']

def compute_performance_metrics(prediction, y_test, df, is_lstm = False):
    
    if is_lstm:
        threshold = 0.80
        for i, each in enumerate(prediction):
            if each[0] > threshold:
                prediction[i] = 1
            else:
                prediction[i] = 0
    
    TN, FP, FN, TP = confusion_matrix(y_test, prediction).ravel()
    
    sensitivity = TP / (TP + FN)
    specificity = TN / (FP + TN)
    precision = TP / (TP + FP)
    accuracy =  (TP+TN) /(TP+FP+TN+FN)
    f1_score = 2 * TP / ((2 * TP) + FP + FN)
    error_rate = (FP + FN) / (TP + FP + FN + TN)
    negative_predicted_value = TN / (TN + FN)
    false_positive_rate = FP / (FP + TN)
    false_discovery_rate = FP / (FP + TP)
    false_negative_rate = FN / (FN + TP)
    balanced_accuracy = 0.5 * ((TP / (TP + FN)) + (TN / (TN + FP)))
    true_skill_statistics = ((TP / (TP + FN)) - (FP / (TN + FP)))
    heidke_skill_score = 2 * ((TP * TN) - (FP * FN)) / (((𝑇𝑃 + 𝐹𝑁) * (𝐹𝑁 + 𝑇𝑁)) +((TP+FP) * (𝐹𝑃 + 𝑇𝑁)))
    
    df = df.append({performance_metrics[0]: TN, performance_metrics[1]: FP, performance_metrics[2]: FN, 
                    performance_metrics[3]: TP, performance_metrics[4]: sensitivity, performance_metrics[5]: specificity, 
                    performance_metrics[6]: precision, performance_metrics[7]: accuracy, performance_metrics[8]: f1_score, 
                    performance_metrics[9]: error_rate, performance_metrics[10]: negative_predicted_value, 
                    performance_metrics[11]: false_positive_rate, performance_metrics[12]: false_discovery_rate, 
                    performance_metrics[13]: false_negative_rate, performance_metrics[14]: 
                    balanced_accuracy, performance_metrics[15]: true_skill_statistics,
                    performance_metrics[16]: heidke_skill_score}, ignore_index=True)
    return df    

# K-fold cross validation

In [8]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, shuffle=True, random_state=0)

### Dataframes for performance metrics

In [9]:
svm_metrics_df = pd.DataFrame(columns=performance_metrics)
kn_metrics_df = pd.DataFrame(columns=performance_metrics)
rf_metrics_df = pd.DataFrame(columns=performance_metrics)
lstm_metrics_df = pd.DataFrame(columns=performance_metrics)

## SVM Model

In [10]:
svm_model = svm.SVC()
for train_index, test_index in kfold.split(X):
    X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]

    # we train the algorithm with training data and training output
    svm_model.fit(X_train, y_train)

    # we pass the testing data to the stored algorithm to predict the outcome
    prediction = svm_model.predict(X_test)

    # print metrics
    svm_metrics_df = compute_performance_metrics(prediction, y_test, svm_metrics_df)

svm_metrics_df.index += 1
svm_metrics_df.loc['Average'] = svm_metrics_df.mean()

In [11]:
svm_metrics_df

Unnamed: 0,True Negative,False Positive,False Negative,True Positivity,Sensitivity,Specificity,Precision,Accuracy,F1 Score,Error Rate,Negative Predicted Value,False Positve Rate,False Discovery Rate,False Negative Rate,Balanced Accuracy,True Skill Statistics,Heidke Skill Score
1,22.0,0.0,0.0,35.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
2,23.0,2.0,1.0,31.0,0.96875,0.92,0.939394,0.947368,0.953846,0.052632,0.958333,0.08,0.060606,0.03125,0.944375,0.88875,0.892655
3,15.0,1.0,1.0,40.0,0.97561,0.9375,0.97561,0.964912,0.97561,0.035088,0.9375,0.0625,0.02439,0.02439,0.956555,0.91311,0.91311
4,20.0,0.0,0.0,37.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
5,17.0,1.0,0.0,39.0,1.0,0.944444,0.975,0.982456,0.987342,0.017544,1.0,0.055556,0.025,0.0,0.972222,0.944444,0.958785
6,19.0,3.0,0.0,35.0,1.0,0.863636,0.921053,0.947368,0.958904,0.052632,1.0,0.136364,0.078947,0.0,0.931818,0.863636,0.886076
7,22.0,1.0,1.0,33.0,0.970588,0.956522,0.970588,0.964912,0.970588,0.035088,0.956522,0.043478,0.029412,0.029412,0.963555,0.92711,0.92711
8,23.0,0.0,2.0,32.0,0.941176,1.0,1.0,0.964912,0.969697,0.035088,0.92,0.0,0.0,0.058824,0.970588,0.941176,0.928121
9,18.0,0.0,0.0,39.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
10,25.0,0.0,0.0,31.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0


### K-Nearest Neighbors

In [12]:
model = KNeighborsClassifier(n_neighbors=3) # this examines 3 neighbors for putting the data into class

for train_index, test_index in kfold.split(X):
    X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]

    # we train the algorithm with training data and training output
    model.fit(X_train, y_train)

    # we pass the testing data to the stored algorithm to predict the outcome
    prediction = model.predict(X_test)

    # print metrics
    kn_metrics_df = compute_performance_metrics(prediction, y_test, kn_metrics_df)

kn_metrics_df.index += 1
kn_metrics_df.loc['Average'] = kn_metrics_df.mean()

In [13]:
kn_metrics_df

Unnamed: 0,True Negative,False Positive,False Negative,True Positivity,Sensitivity,Specificity,Precision,Accuracy,F1 Score,Error Rate,Negative Predicted Value,False Positve Rate,False Discovery Rate,False Negative Rate,Balanced Accuracy,True Skill Statistics,Heidke Skill Score
1,21.0,1.0,0.0,35.0,1.0,0.954545,0.972222,0.982456,0.985915,0.017544,1.0,0.045455,0.027778,0.0,0.977273,0.954545,0.962672
2,21.0,4.0,0.0,32.0,1.0,0.84,0.888889,0.929825,0.941176,0.070175,1.0,0.16,0.111111,0.0,0.92,0.84,0.854962
3,14.0,2.0,1.0,40.0,0.97561,0.875,0.952381,0.947368,0.963855,0.052632,0.933333,0.125,0.047619,0.02439,0.925305,0.85061,0.867133
4,19.0,1.0,0.0,37.0,1.0,0.95,0.973684,0.982456,0.986667,0.017544,1.0,0.05,0.026316,0.0,0.975,0.95,0.961039
5,17.0,1.0,1.0,38.0,0.974359,0.944444,0.974359,0.964912,0.974359,0.035088,0.944444,0.055556,0.025641,0.025641,0.959402,0.918803,0.918803
6,19.0,3.0,0.0,35.0,1.0,0.863636,0.921053,0.947368,0.958904,0.052632,1.0,0.136364,0.078947,0.0,0.931818,0.863636,0.886076
7,22.0,1.0,0.0,34.0,1.0,0.956522,0.971429,0.982456,0.985507,0.017544,1.0,0.043478,0.028571,0.0,0.978261,0.956522,0.963297
8,23.0,0.0,0.0,34.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
9,18.0,0.0,0.0,39.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
10,23.0,2.0,0.0,31.0,1.0,0.92,0.939394,0.964286,0.96875,0.035714,1.0,0.08,0.060606,0.0,0.96,0.92,0.927178


### Random Forest Classifier

In [14]:
#Create a Gaussian Classifier
model = RandomForestClassifier(n_estimators=100)

for train_index, test_index in kfold.split(X):
    X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]

    # we train the algorithm with training data and training output
    model.fit(X_train, y_train)

    # we pass the testing data to the stored algorithm to predict the outcome
    prediction = model.predict(X_test)

    # print metrics
    rf_metrics_df = compute_performance_metrics(prediction, y_test, rf_metrics_df)

rf_metrics_df.index += 1
rf_metrics_df.loc['Average'] = rf_metrics_df.mean()

In [15]:
rf_metrics_df

Unnamed: 0,True Negative,False Positive,False Negative,True Positivity,Sensitivity,Specificity,Precision,Accuracy,F1 Score,Error Rate,Negative Predicted Value,False Positve Rate,False Discovery Rate,False Negative Rate,Balanced Accuracy,True Skill Statistics,Heidke Skill Score
1,21.0,1.0,2.0,33.0,0.942857,0.954545,0.970588,0.947368,0.956522,0.052632,0.913043,0.045455,0.029412,0.057143,0.948701,0.897403,0.889891
2,24.0,1.0,0.0,32.0,1.0,0.96,0.969697,0.982456,0.984615,0.017544,1.0,0.04,0.030303,0.0,0.98,0.96,0.964218
3,15.0,1.0,2.0,39.0,0.95122,0.9375,0.975,0.947368,0.962963,0.052632,0.882353,0.0625,0.025,0.04878,0.94436,0.88872,0.872102
4,19.0,1.0,0.0,37.0,1.0,0.95,0.973684,0.982456,0.986667,0.017544,1.0,0.05,0.026316,0.0,0.975,0.95,0.961039
5,17.0,1.0,1.0,38.0,0.974359,0.944444,0.974359,0.964912,0.974359,0.035088,0.944444,0.055556,0.025641,0.025641,0.959402,0.918803,0.918803
6,17.0,5.0,2.0,33.0,0.942857,0.772727,0.868421,0.877193,0.90411,0.122807,0.894737,0.227273,0.131579,0.057143,0.857792,0.715584,0.734177
7,22.0,1.0,0.0,34.0,1.0,0.956522,0.971429,0.982456,0.985507,0.017544,1.0,0.043478,0.028571,0.0,0.978261,0.956522,0.963297
8,20.0,3.0,1.0,33.0,0.970588,0.869565,0.916667,0.929825,0.942857,0.070175,0.952381,0.130435,0.083333,0.029412,0.920077,0.840153,0.85214
9,18.0,0.0,0.0,39.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
10,24.0,1.0,0.0,31.0,1.0,0.96,0.96875,0.982143,0.984127,0.017857,1.0,0.04,0.03125,0.0,0.98,0.96,0.963731


# LSTM classifier

In [16]:
model = Sequential()
model.add(LSTM(20, input_shape=(input_length, 1)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

for train_index, test_index in kfold.split(X):
    print('*'*100)
    X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]

    # we train the algorithm with training data and training output
    model.fit(X_train, y_train, batch_size=8, epochs=10, validation_data=(X_test, y_test), verbose = 1)

    # we pass the testing data to the stored algorithm to predict the outcome
    prediction = model.predict(X_test)
    
    # print metrics
    lstm_metrics_df = compute_performance_metrics(prediction, y_test, lstm_metrics_df, is_lstm=True)

lstm_metrics_df.index += 1
lstm_metrics_df.loc['Average'] = lstm_metrics_df.mean()

Metal device set to: Apple M1
****************************************************************************************************


2021-12-04 16:56:43.997537: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-12-04 16:56:43.997965: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2021-12-04 16:56:44.209424: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1/10


2021-12-04 16:56:44.590973: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-12-04 16:56:44.711991: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


 5/64 [=>............................] - ETA: 0s - loss: 0.8705 - accuracy: 0.2250     

2021-12-04 16:56:44.831839: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2021-12-04 16:56:45.915998: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-12-04 16:56:45.966215: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
****************************************************************************************************
Epoch 1/10
 5/64 [=>............................] - ETA: 0s - loss: 0.3474 - accuracy: 0.8750

2021-12-04 16:56:54.706389: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-12-04 16:56:54.742269: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
****************************************************************************************************
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
****************************************************************************************************
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
****************************************************************************************************
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
****************************************************************************************************
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
****************************************************

2021-12-04 16:58:11.170527: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-12-04 16:58:11.313024: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


 7/65 [==>...........................] - ETA: 1s - loss: 0.0560 - accuracy: 1.0000

2021-12-04 16:58:11.390411: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
 1/65 [..............................] - ETA: 1s - loss: 0.1620 - accuracy: 1.0000

2021-12-04 16:58:12.789349: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-12-04 16:58:12.837932: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [17]:
lstm_metrics_df

Unnamed: 0,True Negative,False Positive,False Negative,True Positivity,Sensitivity,Specificity,Precision,Accuracy,F1 Score,Error Rate,Negative Predicted Value,False Positve Rate,False Discovery Rate,False Negative Rate,Balanced Accuracy,True Skill Statistics,Heidke Skill Score
1,17.0,5.0,0.0,35.0,1.0,0.772727,0.875,0.912281,0.933333,0.087719,1.0,0.227273,0.125,0.0,0.886364,0.772727,0.80678
2,23.0,2.0,6.0,26.0,0.8125,0.92,0.928571,0.859649,0.866667,0.140351,0.793103,0.08,0.071429,0.1875,0.86625,0.7325,0.719902
3,15.0,1.0,7.0,34.0,0.829268,0.9375,0.971429,0.859649,0.894737,0.140351,0.681818,0.0625,0.028571,0.170732,0.883384,0.766768,0.688098
4,18.0,2.0,1.0,36.0,0.972973,0.9,0.947368,0.947368,0.96,0.052632,0.947368,0.1,0.052632,0.027027,0.936486,0.872973,0.883117
5,17.0,1.0,6.0,33.0,0.846154,0.944444,0.970588,0.877193,0.90411,0.122807,0.73913,0.055556,0.029412,0.153846,0.895299,0.790598,0.735586
6,21.0,1.0,11.0,24.0,0.685714,0.954545,0.96,0.789474,0.8,0.210526,0.65625,0.045455,0.04,0.314286,0.82013,0.64026,0.590419
7,21.0,2.0,3.0,31.0,0.911765,0.913043,0.939394,0.912281,0.925373,0.087719,0.875,0.086957,0.060606,0.088235,0.912404,0.824808,0.819048
8,22.0,1.0,5.0,29.0,0.852941,0.956522,0.966667,0.894737,0.90625,0.105263,0.814815,0.043478,0.033333,0.147059,0.904731,0.809463,0.787313
9,17.0,1.0,3.0,36.0,0.923077,0.944444,0.972973,0.929825,0.947368,0.070175,0.85,0.055556,0.027027,0.076923,0.933761,0.867521,0.842324
10,25.0,0.0,5.0,26.0,0.83871,1.0,1.0,0.910714,0.912281,0.089286,0.833333,0.0,0.0,0.16129,0.919355,0.83871,0.822785


### Cumulative metrics

In [29]:
all_dfs = [svm_metrics_df, kn_metrics_df, lstm_metrics_df]
all_names = ['SVM', 'KNN', 'LSTM']

#### 1st Fold

In [34]:
# lstm_metrics_df.loc[1,:]
df = pd.DataFrame(columns=performance_metrics)
fold_count = 1
for i, each_df in enumerate(all_dfs):
    temp_df = each_df.xs(fold_count)
    temp_df.name = all_names[i]
    df = df.append(temp_df)
df

Unnamed: 0,True Negative,False Positive,False Negative,True Positivity,Sensitivity,Specificity,Precision,Accuracy,F1 Score,Error Rate,Negative Predicted Value,False Positve Rate,False Discovery Rate,False Negative Rate,Balanced Accuracy,True Skill Statistics,Heidke Skill Score
SVM,22.0,0.0,0.0,35.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
KNN,21.0,1.0,0.0,35.0,1.0,0.954545,0.972222,0.982456,0.985915,0.017544,1.0,0.045455,0.027778,0.0,0.977273,0.954545,0.962672
LSTM,17.0,5.0,0.0,35.0,1.0,0.772727,0.875,0.912281,0.933333,0.087719,1.0,0.227273,0.125,0.0,0.886364,0.772727,0.80678


#### 2nd Fold

In [35]:
# lstm_metrics_df.loc[1,:]
df = pd.DataFrame(columns=performance_metrics)
fold_count = 2
for i, each_df in enumerate(all_dfs):
    temp_df = each_df.xs(fold_count)
    temp_df.name = all_names[i]
    df = df.append(temp_df)
df

Unnamed: 0,True Negative,False Positive,False Negative,True Positivity,Sensitivity,Specificity,Precision,Accuracy,F1 Score,Error Rate,Negative Predicted Value,False Positve Rate,False Discovery Rate,False Negative Rate,Balanced Accuracy,True Skill Statistics,Heidke Skill Score
SVM,23.0,2.0,1.0,31.0,0.96875,0.92,0.939394,0.947368,0.953846,0.052632,0.958333,0.08,0.060606,0.03125,0.944375,0.88875,0.892655
KNN,21.0,4.0,0.0,32.0,1.0,0.84,0.888889,0.929825,0.941176,0.070175,1.0,0.16,0.111111,0.0,0.92,0.84,0.854962
LSTM,23.0,2.0,6.0,26.0,0.8125,0.92,0.928571,0.859649,0.866667,0.140351,0.793103,0.08,0.071429,0.1875,0.86625,0.7325,0.719902


#### 3rd Fold

In [36]:
# lstm_metrics_df.loc[1,:]
df = pd.DataFrame(columns=performance_metrics)
fold_count = 3
for i, each_df in enumerate(all_dfs):
    temp_df = each_df.xs(fold_count)
    temp_df.name = all_names[i]
    df = df.append(temp_df)
df

Unnamed: 0,True Negative,False Positive,False Negative,True Positivity,Sensitivity,Specificity,Precision,Accuracy,F1 Score,Error Rate,Negative Predicted Value,False Positve Rate,False Discovery Rate,False Negative Rate,Balanced Accuracy,True Skill Statistics,Heidke Skill Score
SVM,15.0,1.0,1.0,40.0,0.97561,0.9375,0.97561,0.964912,0.97561,0.035088,0.9375,0.0625,0.02439,0.02439,0.956555,0.91311,0.91311
KNN,14.0,2.0,1.0,40.0,0.97561,0.875,0.952381,0.947368,0.963855,0.052632,0.933333,0.125,0.047619,0.02439,0.925305,0.85061,0.867133
LSTM,15.0,1.0,7.0,34.0,0.829268,0.9375,0.971429,0.859649,0.894737,0.140351,0.681818,0.0625,0.028571,0.170732,0.883384,0.766768,0.688098


#### 4th Fold

In [37]:
# lstm_metrics_df.loc[1,:]
df = pd.DataFrame(columns=performance_metrics)
fold_count = 4
for i, each_df in enumerate(all_dfs):
    temp_df = each_df.xs(fold_count)
    temp_df.name = all_names[i]
    df = df.append(temp_df)
df

Unnamed: 0,True Negative,False Positive,False Negative,True Positivity,Sensitivity,Specificity,Precision,Accuracy,F1 Score,Error Rate,Negative Predicted Value,False Positve Rate,False Discovery Rate,False Negative Rate,Balanced Accuracy,True Skill Statistics,Heidke Skill Score
SVM,20.0,0.0,0.0,37.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
KNN,19.0,1.0,0.0,37.0,1.0,0.95,0.973684,0.982456,0.986667,0.017544,1.0,0.05,0.026316,0.0,0.975,0.95,0.961039
LSTM,18.0,2.0,1.0,36.0,0.972973,0.9,0.947368,0.947368,0.96,0.052632,0.947368,0.1,0.052632,0.027027,0.936486,0.872973,0.883117


#### 5th Fold

In [38]:
# lstm_metrics_df.loc[1,:]
df = pd.DataFrame(columns=performance_metrics)
fold_count = 5
for i, each_df in enumerate(all_dfs):
    temp_df = each_df.xs(fold_count)
    temp_df.name = all_names[i]
    df = df.append(temp_df)
df

Unnamed: 0,True Negative,False Positive,False Negative,True Positivity,Sensitivity,Specificity,Precision,Accuracy,F1 Score,Error Rate,Negative Predicted Value,False Positve Rate,False Discovery Rate,False Negative Rate,Balanced Accuracy,True Skill Statistics,Heidke Skill Score
SVM,17.0,1.0,0.0,39.0,1.0,0.944444,0.975,0.982456,0.987342,0.017544,1.0,0.055556,0.025,0.0,0.972222,0.944444,0.958785
KNN,17.0,1.0,1.0,38.0,0.974359,0.944444,0.974359,0.964912,0.974359,0.035088,0.944444,0.055556,0.025641,0.025641,0.959402,0.918803,0.918803
LSTM,17.0,1.0,6.0,33.0,0.846154,0.944444,0.970588,0.877193,0.90411,0.122807,0.73913,0.055556,0.029412,0.153846,0.895299,0.790598,0.735586


#### 6th Fold

In [39]:
# lstm_metrics_df.loc[1,:]
df = pd.DataFrame(columns=performance_metrics)
fold_count = 6
for i, each_df in enumerate(all_dfs):
    temp_df = each_df.xs(fold_count)
    temp_df.name = all_names[i]
    df = df.append(temp_df)
df

Unnamed: 0,True Negative,False Positive,False Negative,True Positivity,Sensitivity,Specificity,Precision,Accuracy,F1 Score,Error Rate,Negative Predicted Value,False Positve Rate,False Discovery Rate,False Negative Rate,Balanced Accuracy,True Skill Statistics,Heidke Skill Score
SVM,19.0,3.0,0.0,35.0,1.0,0.863636,0.921053,0.947368,0.958904,0.052632,1.0,0.136364,0.078947,0.0,0.931818,0.863636,0.886076
KNN,19.0,3.0,0.0,35.0,1.0,0.863636,0.921053,0.947368,0.958904,0.052632,1.0,0.136364,0.078947,0.0,0.931818,0.863636,0.886076
LSTM,21.0,1.0,11.0,24.0,0.685714,0.954545,0.96,0.789474,0.8,0.210526,0.65625,0.045455,0.04,0.314286,0.82013,0.64026,0.590419


#### 7th Fold

In [40]:
# lstm_metrics_df.loc[1,:]
df = pd.DataFrame(columns=performance_metrics)
fold_count = 7
for i, each_df in enumerate(all_dfs):
    temp_df = each_df.xs(fold_count)
    temp_df.name = all_names[i]
    df = df.append(temp_df)
df

Unnamed: 0,True Negative,False Positive,False Negative,True Positivity,Sensitivity,Specificity,Precision,Accuracy,F1 Score,Error Rate,Negative Predicted Value,False Positve Rate,False Discovery Rate,False Negative Rate,Balanced Accuracy,True Skill Statistics,Heidke Skill Score
SVM,22.0,1.0,1.0,33.0,0.970588,0.956522,0.970588,0.964912,0.970588,0.035088,0.956522,0.043478,0.029412,0.029412,0.963555,0.92711,0.92711
KNN,22.0,1.0,0.0,34.0,1.0,0.956522,0.971429,0.982456,0.985507,0.017544,1.0,0.043478,0.028571,0.0,0.978261,0.956522,0.963297
LSTM,21.0,2.0,3.0,31.0,0.911765,0.913043,0.939394,0.912281,0.925373,0.087719,0.875,0.086957,0.060606,0.088235,0.912404,0.824808,0.819048


#### 8th Fold

In [42]:
# lstm_metrics_df.loc[1,:]
df = pd.DataFrame(columns=performance_metrics)
fold_count = 8
for i, each_df in enumerate(all_dfs):
    temp_df = each_df.xs(fold_count)
    temp_df.name = all_names[i]
    df = df.append(temp_df)
df

Unnamed: 0,True Negative,False Positive,False Negative,True Positivity,Sensitivity,Specificity,Precision,Accuracy,F1 Score,Error Rate,Negative Predicted Value,False Positve Rate,False Discovery Rate,False Negative Rate,Balanced Accuracy,True Skill Statistics,Heidke Skill Score
SVM,23.0,0.0,2.0,32.0,0.941176,1.0,1.0,0.964912,0.969697,0.035088,0.92,0.0,0.0,0.058824,0.970588,0.941176,0.928121
KNN,23.0,0.0,0.0,34.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
LSTM,22.0,1.0,5.0,29.0,0.852941,0.956522,0.966667,0.894737,0.90625,0.105263,0.814815,0.043478,0.033333,0.147059,0.904731,0.809463,0.787313


#### 9th Fold

In [43]:
# lstm_metrics_df.loc[1,:]
df = pd.DataFrame(columns=performance_metrics)
fold_count = 9
for i, each_df in enumerate(all_dfs):
    temp_df = each_df.xs(fold_count)
    temp_df.name = all_names[i]
    df = df.append(temp_df)
df

Unnamed: 0,True Negative,False Positive,False Negative,True Positivity,Sensitivity,Specificity,Precision,Accuracy,F1 Score,Error Rate,Negative Predicted Value,False Positve Rate,False Discovery Rate,False Negative Rate,Balanced Accuracy,True Skill Statistics,Heidke Skill Score
SVM,18.0,0.0,0.0,39.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
KNN,18.0,0.0,0.0,39.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
LSTM,17.0,1.0,3.0,36.0,0.923077,0.944444,0.972973,0.929825,0.947368,0.070175,0.85,0.055556,0.027027,0.076923,0.933761,0.867521,0.842324


#### 10th Fold

In [44]:
# lstm_metrics_df.loc[1,:]
df = pd.DataFrame(columns=performance_metrics)
fold_count = 10
for i, each_df in enumerate(all_dfs):
    temp_df = each_df.xs(fold_count)
    temp_df.name = all_names[i]
    df = df.append(temp_df)
df

Unnamed: 0,True Negative,False Positive,False Negative,True Positivity,Sensitivity,Specificity,Precision,Accuracy,F1 Score,Error Rate,Negative Predicted Value,False Positve Rate,False Discovery Rate,False Negative Rate,Balanced Accuracy,True Skill Statistics,Heidke Skill Score
SVM,25.0,0.0,0.0,31.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
KNN,23.0,2.0,0.0,31.0,1.0,0.92,0.939394,0.964286,0.96875,0.035714,1.0,0.08,0.060606,0.0,0.96,0.92,0.927178
LSTM,25.0,0.0,5.0,26.0,0.83871,1.0,1.0,0.910714,0.912281,0.089286,0.833333,0.0,0.0,0.16129,0.919355,0.83871,0.822785


#### Average of all

In [45]:
# lstm_metrics_df.loc[1,:]
df = pd.DataFrame(columns=performance_metrics)
fold_count = 'Average'
for i, each_df in enumerate(all_dfs):
    temp_df = each_df.xs(fold_count)
    temp_df.name = all_names[i]
    df = df.append(temp_df)
df

Unnamed: 0,True Negative,False Positive,False Negative,True Positivity,Sensitivity,Specificity,Precision,Accuracy,F1 Score,Error Rate,Negative Predicted Value,False Positve Rate,False Discovery Rate,False Negative Rate,Balanced Accuracy,True Skill Statistics,Heidke Skill Score
SVM,20.4,0.8,0.5,35.2,0.985612,0.96221,0.978164,0.977193,0.981599,0.022807,0.977236,0.03779,0.021836,0.014388,0.973911,0.947823,0.950586
KNN,19.7,1.5,0.2,35.5,0.994997,0.930415,0.959341,0.970113,0.976513,0.029887,0.987778,0.069585,0.040659,0.005003,0.962706,0.925412,0.934116
LSTM,19.6,1.6,4.7,31.0,0.86731,0.924323,0.953199,0.889317,0.905012,0.110683,0.819082,0.075677,0.046801,0.13269,0.895816,0.791633,0.769537


### Observation
* I consider balanced accuracy to be the optimal metric to find the best model.
* The case being, SVM is the model which is giving the highest balanced accuracy.

### Why is SVM performing better?
* SVM doesn't get affected by outliers
* It does not suffer from overfitting
* It is more efficient than other ML algorithms listed here