**KNN Classifier**

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import fbeta_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer,fbeta_score
from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    accuracy_score,
)

Define evaluation-metric helper functions for use in the cells below.

In [2]:
def f2_func(y_true, y_pred):
    """Return the weighted-average F-beta score with beta=2.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    The value returned by ``fbeta_score`` for ``beta=2`` and
    ``average='weighted'``.
    """
    # beta=2 weights recall more heavily than precision in the F-measure.
    return fbeta_score(y_true, y_pred, average='weighted', beta=2)

def my_f2_scorer():
    """Build a scorer object around ``f2_func``.

    Returns
    -------
    The object produced by ``make_scorer(f2_func)``, suitable for passing as
    the ``scoring`` argument of a model-selection utility.
    """
    scorer = make_scorer(f2_func)
    return scorer



In [3]:
# Training split: features and labels, read from CSV files in the working directory.
X_train, y_train = pd.read_csv('train_X_In-Car-Rec.csv'), pd.read_csv('train_y_In-Car-Rec.csv')

# Test split: features and labels, used only for the final evaluation cells.
X_test, y_test = pd.read_csv('test_X_In-Car-Rec.csv'), pd.read_csv('test_y_In-Car-Rec.csv')

Finding the maximum value of `n_neighbors` to try, using the common rule of thumb $k \approx \sqrt{n}$, where $n$ is the number of observations in the training set.

In [4]:
# Rule-of-thumb upper bound for k: the square root of the number of training rows.
n_train_rows = len(X_train)
round(np.sqrt(n_train_rows))



101

Setting up the parameter grid for the grid search. Candidate values of `n_neighbors` start at 1 and increase in steps of 5, bounded above by 101, which is the rounded square root of the number of observations in the training set.

In [5]:
# Largest k to try: the rounded square root of the training-set size (101 here).
MAX_NEIGHBORS = 101

# Hyperparameter grid explored by the grid search.
param_grid = {
    # 1, 6, 11, ..., 101. The stop value is MAX_NEIGHBORS + 1 because range()
    # excludes its stop; a stop of 101 would silently end the grid at 96.
    'n_neighbors': list(range(1, MAX_NEIGHBORS + 1, 5)),
    # Distance metrics to compare.
    'metric': ['euclidean', 'cosine'],
}


In [6]:
# Single-step pipeline wrapping a default KNN classifier under the step name "knn".
# NOTE(review): the grid search in this notebook is given a bare
# KNeighborsClassifier() rather than this pipeline - confirm whether
# `pipeline` is still needed.
pipeline = Pipeline(steps=[("knn", KNeighborsClassifier())])

Using an exhaustive grid search, since the `KNeighborsClassifier` model has only a small number of hyperparameters to tune.

In [7]:
# Exhaustive search over param_grid with 10-fold cross-validation, ranking
# candidates with the custom F2 scorer.
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    scoring=my_f2_scorer(),
    cv=10,
)

# Run the search on the training data; the label frame is flattened to 1-D first.
grid_search.fit(X_train, y_train.values.ravel())

In [8]:
# Pull the winning configuration out of the fitted search.
best_estimator = grid_search.best_estimator_
best_params, best_score = grid_search.best_params_, grid_search.best_score_

# Display the selected hyperparameters together with their cross-validated score.
(best_params, best_score)

({'metric': 'euclidean', 'n_neighbors': 36}, 0.6980771345788905)

In [9]:
# Build a fresh classifier configured with the hyperparameters chosen by the search.
knn = KNeighborsClassifier(**best_params)

# Show the configured estimator as the cell output.
knn

In [10]:
# Fit the tuned classifier on the training set (labels flattened to 1-D).
train_labels = y_train.values.ravel()
knn.fit(X_train, train_labels)

In [11]:
# Predict labels for the test features with the fitted KNN classifier
y_pred = knn.predict(X_test)

In [12]:
# Score the fitted classifier on the test split via the estimator's own score() method.
score = knn.score(X_test, y_test)

# Weighted F2 score (F-beta with beta=2) of the test predictions.
f2_score = fbeta_score(y_test, y_pred, beta=2, average='weighted')
print(f'F2Score for the KNN Model is: {f2_score}')

F2Score for the KNN Model is: 0.707296744500565


In [13]:
# Compute every test-set metric first, then report them together.
# In scikit-learn's confusion matrix, rows are true labels and columns are predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")
accuracy = accuracy_score(y_test, y_pred)

# Report: raw confusion matrix followed by the summary metrics to 4 decimals.
print("Confusion Matrix:")
print(conf_matrix)
print(f"\nPrecision (weighted): {precision:.4f}")
print(f"Recall (weighted): {recall:.4f}")
print(f"Accuracy: {accuracy:.4f}")

Confusion Matrix:
[[ 632  446]
 [ 291 1168]]

Precision (weighted): 0.7071
Recall (weighted): 0.7095
Accuracy: 0.7095
