# KNN with Parameter Tuning

In this project, I implement from scratch the k-NN algorithm on a dataset containing breast cancer data. 
I also implement my own cross-validation algorithm in order to tune the model.

In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math 
import statistics

In [35]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

breast_cancer_train = pd.read_csv("breast_cancer_train.csv")
breast_cancer_test = pd.read_csv("breast_cancer_test.csv")
breast_cancer_train.info()
breast_cancer_test.info()

breast_cancer_train['target'].value_counts(normalize=1)

x_train = breast_cancer_train.drop('target', axis=1)
x_test = breast_cancer_test.drop('target', axis=1)

y_train = breast_cancer_train[['target']]
y_test = breast_cancer_test[['target']]

x_train_normalized = ((x_train - x_train.min()) / (x_train.max() - x_train.min()) )
x_test_normalized = ((x_test - x_test.min()) / (x_test.max() - x_test.min()) )

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 455 entries, 0 to 454
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              455 non-null    float64
 1   mean texture             455 non-null    float64
 2   mean perimeter           455 non-null    float64
 3   mean area                455 non-null    float64
 4   mean smoothness          455 non-null    float64
 5   mean compactness         455 non-null    float64
 6   mean concavity           455 non-null    float64
 7   mean concave points      455 non-null    float64
 8   mean symmetry            455 non-null    float64
 9   mean fractal dimension   455 non-null    float64
 10  radius error             455 non-null    float64
 11  texture error            455 non-null    float64
 12  perimeter error          455 non-null    float64
 13  area error               455 non-null    float64
 14  smoothness error         4

In [64]:
def euclidean_distance(p, q):
    sum_cathetus = 0
    for i in range(0, len(p)):
        cathetus = (p[i] - q[i])**2
        sum_cathetus += cathetus
    
    return math.sqrt(sum_cathetus)

    
def knn(x_train_normalized, y_train, query_point, k, distance_func):
    distances = []
    for index in range(0, len(x_train_normalized)):
        d = distance_func(x_train_normalized.iloc[index], query_point)
        distances.append((d,index))
    distances = sorted(distances)

    k_nearest_neighbors = distances[0:k]

    labels = []
    for neighbor in k_nearest_neighbors:
        index = neighbor[1]
        label = y_train['target'][index]
        labels.append(label)
    prediction = statistics.mode(labels)

    return prediction

def assign_evaluation_metric_label(target, prediction):

    if target == 0 and prediction == 0:
        return 'TN'
    if target == 0 and prediction == 1:
        return 'FP'
    if target == 1 and prediction == 0:
        return 'FN'
    if target == 1 and prediction == 1:
        return 'TP'
    
def get_evaluation_metrics(df):
    evaluation_metric_labels = list(df.apply(lambda row: assign_evaluation_metric_label(row['target'], row['predictions']), axis=1))
    tp = evaluation_metric_labels.count('TP')
    fp = evaluation_metric_labels.count('FP')
    fn = evaluation_metric_labels.count('FN')
    tn = evaluation_metric_labels.count('TN')
    recall = tp/(tp+fn)
    precision = tp/(tp+fp) if tp != 0 else 0
    fpr = fp/(fp + tn)
    f1_score = 2*((precision*recall)/(precision+recall)) if precision + recall != 0 else 0
    accuracy = (tp + tn)/df.shape[0]
    conf_matrix = np.array([[tp, fp],
                       [fn, tn]])
    
    return  conf_matrix, recall, precision, fpr, f1_score, accuracy


### Evaluating k = 31 with Euclidean Distance

In [65]:
predictions = []

for i in range(0, len(x_test_normalized)):
    prediction = knn(x_train_normalized, y_train, x_test_normalized.iloc[i], 31, euclidean_distance)
    predictions.append(prediction)

y_predictions = pd.DataFrame(data={'predictions':predictions})

y_test_predictions = pd.concat([y_test, y_predictions], axis=1)

conf_matrix, recall, precision, fpr, f1_score, accuracy = get_evaluation_metrics(y_test_predictions)

print(f' Accuracy: {accuracy}\n Recall: {recall}\n Precision: {precision}\n F1-Score: {f1_score}')

 Accuracy: 0.8947368421052632
 Recall: 0.7857142857142857
 Precision: 1.0
 F1-Score: 0.88
