### Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.feature_selection import VarianceThreshold
import matplotlib.pyplot as plt

### Reading in data


In [2]:
# Load the training and test CSVs; both paths are relative, so they resolve
# against the kernel's current working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../Dataset/Train.csv", "../Dataset/Test.csv")
)

### Feature Extraction (Counts)

In [None]:
# Import the local feature_extraction module and force a reload so that edits
# made to it on disk are picked up without restarting the kernel. The star
# import is repeated after the reload so the names bound in this notebook
# point at the reloaded definitions.
import importlib
import feature_extraction
importlib.reload(feature_extraction)
from feature_extraction import *

# Produce the labels and feature matrices for both splits; these four names are
# what every model cell below fits and evaluates on.
# NOTE(review): get_counts is presumably provided by the star import above, and
# the meaning of the arguments 3 and (1, 1) is defined there — not visible from
# this notebook, confirm before changing them.
y_train, X_train, y_test, X_test = get_counts(train, test, 3, (1, 1))

## k-Nearest Neighbours


### Baseline K-Nearest Neighbours Model

In [None]:
# Baseline kNN: neighbourhood size k = floor(sqrt(number of training rows)).
baseline_k = int(np.sqrt(X_train.shape[0]))
model = KNeighborsClassifier(n_neighbors=baseline_k)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Confusion matrix on the held-out test set (rows = true class, columns = predicted).
cm = confusion_matrix(y_test, y_pred)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[False, True])
cm_display.plot()
plt.show()

# For a binary problem the flattened 2x2 matrix is ordered tn, fp, fn, tp.
tn, fp, fn, tp = cm.ravel()
accuracy = (tp + tn) / (tp + tn + fp + fn)
sensitivity = tp / (tp + fn)  # true-positive rate
specificity = tn / (tn + fp)  # true-negative rate
# Use sklearn's MCC rather than the hand-written formula: the manual version
# multiplies four integer counts (which can overflow fixed-width ints on large
# test sets) and produces nan when any row/column of the matrix is empty.
mcc = metrics.matthews_corrcoef(y_test, y_pred)

# Plain f-strings: wrapping them in {...} built a one-element set, so the
# output was printed as {'accuracy: ...'} instead of accuracy: ...
print(f'accuracy: {accuracy}')
print(f'sensitivity: {sensitivity}')
print(f'specificity: {specificity}')
print(f'mcc: {mcc}')

### kNN Hyperparameter Tuning with GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, matthews_corrcoef

# Hyperparameter grid for kNN. Only n_neighbors actually varies; the other
# entries are pinned to a single value.
param_grid = {
    'n_neighbors': np.arange(50, 201, 10),  # Number of neighbors to use: 50, 60, ..., 200
    'weights': ['distance'],  # Weight function used in prediction
    'metric': ['euclidean'],  # Distance metric
    # Power parameter for the Minkowski metric. It has no effect while
    # metric='euclidean'; it is kept so best_params_ reports the same keys.
    'p': [1]
}

tuned_model = KNeighborsClassifier()

# Score candidates by Matthews correlation coefficient. The scorer is built
# directly from matthews_corrcoef instead of a same-named wrapper function
# that was then overwritten by the scorer object.
mcc_scorer = make_scorer(matthews_corrcoef)

# 3-fold cross-validated search over the grid, using all available cores.
grid_search = GridSearchCV(
    estimator=tuned_model,
    param_grid=param_grid,
    cv=3,
    scoring=mcc_scorer,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(best_params)

### kNN Model After Hyperparameter Tuning

In [None]:
# Evaluate the best estimator found by the grid search on the held-out test set.
best_model = grid_search.best_estimator_
tuned_y_pred = best_model.predict(X_test)

# Confusion matrix (rows = true class, columns = predicted).
cm = confusion_matrix(y_test, tuned_y_pred)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["False", "True"])
cm_display.plot()
plt.show()

# For a binary problem the flattened 2x2 matrix is ordered tn, fp, fn, tp.
tn, fp, fn, tp = cm.ravel()
accuracy = (tp + tn) / (tp + tn + fp + fn)
sensitivity = tp / (tp + fn)  # true-positive rate
specificity = tn / (tn + fp)  # true-negative rate
# Use sklearn's MCC rather than the hand-written formula: the manual version
# multiplies four integer counts (which can overflow fixed-width ints on large
# test sets) and produces nan when any row/column of the matrix is empty.
mcc = metrics.matthews_corrcoef(y_test, tuned_y_pred)

# Plain f-strings: wrapping them in {...} built a one-element set, so the
# output was printed as {'accuracy: ...'} instead of accuracy: ...
print(f'accuracy: {accuracy}')
print(f'sensitivity: {sensitivity}')
print(f'specificity: {specificity}')
print(f'mcc: {mcc}')

### Using VarianceThreshold for feature selection (kNN)

In [None]:
# Sweep a range of variance thresholds for feature selection, refit the kNN
# model on each reduced feature set, and record the test-set metrics so the
# threshold with the highest MCC can be reported.
mcc_arr = []
sensitivity_arr = []
specificity_arr = []
acccuracy_arr = []
variance_values = np.arange(0.1, 2, 0.1)

for val in variance_values:
    # Keep the filtered matrices in loop-local names. Rebinding X_train/X_test
    # here would make every threshold operate on already-filtered data, make
    # the cell give different results when re-run, and leave later cells
    # training on the reduced feature set.
    # NOTE(review): assumes selecting_high_variance_features returns new
    # matrices and does not modify its inputs in place — confirm in
    # feature_extraction.
    X_train_sel, X_test_sel = selecting_high_variance_features(X_train, X_test, val)

    model = KNeighborsClassifier(n_neighbors=50, weights='distance', metric='euclidean', p=1)  # put in best parameters here
    model.fit(X_train_sel, y_train)
    y_pred = model.predict(X_test_sel)

    # For a binary problem the flattened 2x2 matrix is ordered tn, fp, fn, tp.
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    # sklearn's MCC avoids the integer-overflow and nan cases of the manual
    # formula; nan entries would also make the max() selection below unreliable.
    mcc = metrics.matthews_corrcoef(y_test, y_pred)

    acccuracy_arr.append(accuracy)
    sensitivity_arr.append(sensitivity)
    specificity_arr.append(specificity)
    mcc_arr.append(mcc)

plt.plot(variance_values, acccuracy_arr, label='accuracy')
plt.plot(variance_values, mcc_arr, label='mcc')
plt.plot(variance_values, specificity_arr, label='specificity')
plt.plot(variance_values, sensitivity_arr, label='sensitivity')

plt.xlabel('Variance')
plt.ylabel('Metrics')
plt.legend()

plt.show()

# Locate the best threshold once (first index of the maximum MCC) and read all
# corresponding metrics from that single index.
best_idx = int(np.argmax(mcc_arr))
best_var_threshold = variance_values[best_idx]
acc = acccuracy_arr[best_idx]
sens = sensitivity_arr[best_idx]
spec = specificity_arr[best_idx]
mcc_best = mcc_arr[best_idx]
print(f'best threshold for variance for feature selection: {best_var_threshold}')
print('with corresponding metrics:')
print(f'accuracy: {acc}')
print(f'sensitivity: {sens}')
print(f'specificity: {spec}')
print(f'mcc: {mcc_best}')