In [33]:
import pandas as pd
import numpy as np
from scipy.stats import randint
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)

import joblib

In [34]:
file_train = "/Users/bofanchen/Desktop/data_mining/[3] full dataset preparation and baseline model evaluation/train_data.csv"
file_val = "/Users/bofanchen/Desktop/data_mining/[3] full dataset preparation and baseline model evaluation/val_data.csv"


train = pd.read_csv(file_train)
val = pd.read_csv(file_val)

X_train, y_train = train.drop("is_canceled", axis=1), train["is_canceled"]
X_val, y_val = val.drop("is_canceled", axis=1), val["is_canceled"]

In [35]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn.fit(X_train, y_train)

y_pred = knn.predict(X_val)

print("Accuracy:", accuracy_score(y_val, y_pred))
print("Precision:", precision_score(y_val, y_pred))
print("Recall:", recall_score(y_val, y_pred))
print("F1 Score:", f1_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))


Accuracy: 0.806667783548333
Precision: 0.7640065146579804
Recall: 0.6997315035799523
F1 Score: 0.7304578013080038

Classification Report:
               precision    recall  f1-score   support

         0.0       0.83      0.87      0.85     11203
         1.0       0.76      0.70      0.73      6704

    accuracy                           0.81     17907
   macro avg       0.80      0.79      0.79     17907
weighted avg       0.80      0.81      0.80     17907



In [36]:
knn = KNeighborsClassifier()

param_distributions = {
    "n_neighbors":  np.arange(1, 51),
    "weights":      ["uniform", "distance"],
    "metric":       ["euclidean", "manhattan", "minkowski"],
    "p":            [1, 2],
    "leaf_size":    np.arange(15, 61, 5)
}


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rand_search = RandomizedSearchCV(
    estimator            = knn,
    param_distributions   = param_distributions,
    n_iter               = 50,
    scoring              = "f1",
    cv                   = cv,
    n_jobs               = -1,
    random_state         = 42,
    verbose              = 1
)


rand_search.fit(X_train, y_train)

print(f"Best F1 (CV): {rand_search.best_score_:.4f}")
print("Best hyper-parameters:", rand_search.best_params_)




best_knn = rand_search.best_estimator_

y_pred   = best_knn.predict(X_val)


val_accuracy = accuracy_score(y_val, y_pred)
val_precision = precision_score(y_val, y_pred)
val_recall = recall_score(y_val, y_pred)
val_f1 = f1_score(y_val, y_pred)

print(f"\nValidation Accuracy: {val_accuracy:.4f}")
print(f"Validation Precision: {val_precision:.4f}")
print(f"Validation Recall: {val_recall:.4f}")
print(f"Validation F1 Score: {val_f1:.4f}\n")

print("Classification report:")
print(classification_report(y_val, y_pred, digits=3))
print("Confusion matrix:")
print(confusion_matrix(y_val, y_pred))

Fitting 5 folds for each of 50 candidates, totalling 250 fits


Best F1 (CV): 0.7617
Best hyper-parameters: {'weights': 'distance', 'p': 2, 'n_neighbors': np.int64(39), 'metric': 'manhattan', 'leaf_size': np.int64(50)}

Validation Accuracy: 0.8483
Validation Precision: 0.8478
Validation Recall: 0.7251
Validation F1 Score: 0.7816

Classification report:
              precision    recall  f1-score   support

         0.0      0.849     0.922     0.884     11203
         1.0      0.848     0.725     0.782      6704

    accuracy                          0.848     17907
   macro avg      0.848     0.824     0.833     17907
weighted avg      0.848     0.848     0.846     17907

Confusion matrix:
[[10330   873]
 [ 1843  4861]]


In [37]:
joblib.dump(best_knn, "knn_model.joblib")

['knn_model.joblib']