# Import Libraries and files

In [1]:
from time import time
import warnings
# Install a global "ignore" filter: warnings raised by the cells below
# (library deprecations, NumPy runtime warnings such as invalid values
# passed to np.arctanh, ...) will not be displayed.
warnings.filterwarnings('ignore')

In [2]:
import os 
import sys

# Locate the project root: split the current working directory into its path
# components and keep everything up to and including the
# 'Machine_Learning_project' folder. `.index` raises ValueError when the
# notebook is started from outside that folder.
dir_path = os.getcwd().split(os.path.sep)
root_index = dir_path.index('Machine_Learning_project')
root_path = os.path.sep.join(dir_path[:root_index + 1])

# Make the project's code folders importable (appended in this order).
for code_subdir in ('/code/', '/code/data_loaders/', '/code/utils_sklearn', '/code/metric/'):
    sys.path.append(root_path + code_subdir)


In [3]:
import numpy as np
from data import *
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn_utils import *
from data_cup import *
from Trainer_Cup import *
from mee import *
from sklearn.model_selection import train_test_split

# KNN Regressor and KNN Classifier

In this notebook, we have used the ***KNN algorithm*** for both classification (on the Monk's datasets) and regression (on the Cup dataset). The KNN algorithm is a non-parametric method used for classification and regression tasks. It works by finding the k nearest neighbors of a given data point and using their class labels (for classification) or target values (for regression) to make a prediction.

For the Monk's datasets (Monk 1, Monk 2, and Monk 3), the code is using the ***KNN classifier***. The KNN classifier predicts the class label of a new data point by finding the k nearest neighbors in the training data and assigning the majority class label among those neighbors to the new data point.

The code performs a grid search over different hyperparameters of the KNN classifier, such as the number of neighbors (n_neighbors), the weight function (weights), the distance metric (metric), and the Minkowski distance parameter (p). This grid search helps find the combination of hyperparameters that maximizes the cross-validated performance of the KNN classifier on the training data.

For the Cup dataset, the code is using the ***KNN regressor***. The KNN regressor predicts the target value of a new data point by finding the k nearest neighbors in the training data and taking the average of their target values (a plain average with `weights="uniform"`, or an inverse-distance-weighted average with `weights="distance"`).

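The code applies polynomial features of degree 2 to the Cup dataset using the PolynomialFeatures class from scikit-learn, drops the constant bias column, and then passes the expanded features element-wise through `np.arctanh`. The polynomial expansion helps capture non-linear relationships in the data, which can improve the performance of the KNN regressor.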

 # MONK 1


In [19]:
# Load the MONK-1 training and test datasets (train first, then test).
m1_train, m1_test = (
    MonksDataset(split_name) for split_name in ('monk1_train', 'monk1_test')
)

In [20]:
# The result of get_monks_data is unpacked as (X_dev, y_dev, X_test, y_test).
# These generic names are reassigned by the MONK 2, MONK 3 and CUP sections,
# so they always refer to the most recently executed section.
X_dev, y_dev, X_test, y_test = get_monks_data(m1_train, m1_test)

In [21]:
# The pd.get_dummies encoding that used to live in this cell was removed:
# both X_train_encoded_m1 and X_test_encoded_m1 are (re)assigned by the
# OneHotEncoder cell below before anything reads them, so the result was
# never used. Encoding train and test independently with get_dummies can also
# yield different column sets when a category is missing from one split,
# whereas the fitted OneHotEncoder applies one mapping to both splits.

In [22]:
# One-hot encoder shared by the three MONK sections (each section re-fits it).
# sparse_output=False asks scikit-learn for a dense array instead of a sparse matrix.
encoder = OneHotEncoder(sparse_output=False)

In [23]:
# Learn the category mapping from the MONK-1 development data only ...
X_train_encoded_m1 = encoder.fit_transform(X_dev)

# ... and apply that same mapping to the test data (no re-fitting on test).
X_test_encoded_m1 = encoder.transform(X_test)

In [24]:
# Integer part of the square root of the number of MONK-1 training rows.
# NOTE(review): sqrt_n is not referenced by any later cell shown in this notebook.
sqrt_n = int(np.sqrt(X_train_encoded_m1.shape[0]))

In [25]:
# Search space for the MONK-1 KNN classifier.
knn_hyperparam = {
    # k from 2 up to (but excluding) half the number of training rows
    "n_neighbors": np.arange(2, len(X_train_encoded_m1) // 2),
    "weights": ["uniform", "distance"],
    "metric": ["minkowski", "cosine"],
    # NOTE(review): in scikit-learn `p` only affects the Minkowski metric, so
    # the cosine combinations presumably repeat the same model for every p.
    "p": [1, 2, 3, 5, 7],
}

In [None]:
# Grid search for the MONK-1 KNN classifier with 5 cross-validation splits.
knn_grid_search = CustomGridSearch(
    estimator='knn',
    hyperparameters=knn_hyperparam,
    cv_splits=5,
)
# Labels are flattened to 1-D for fitting.
# NOTE(review): evaluate() below receives y_dev / y_test un-flattened — confirm
# CustomGridSearch.evaluate accepts both shapes.
knn_grid_search.fit(X_train_encoded_m1, y_dev.values.ravel())
print()

# Score the selected model on the data it was tuned on.
accuracy_train, mse_train = knn_grid_search.evaluate(X_train_encoded_m1, y_dev)
print()
print("Accuracy Train:", accuracy_train)
print("MSE Train:", mse_train)

# Score the selected model on the MONK-1 test set.
accuracy_test, mse_test = knn_grid_search.evaluate(X_test_encoded_m1, y_test)
print()
print("Accuracy Test:", accuracy_test)
print("MSE Test:", mse_test)

# MONK 2

In [19]:
# Load the MONK-2 training and test datasets (train first, then test).
m2_train, m2_test = (
    MonksDataset(split_name) for split_name in ('monk2_train', 'monk2_test')
)
# Rebinds the generic X_dev / y_dev / X_test / y_test names to the MONK-2 data.
X_dev, y_dev, X_test, y_test = get_monks_data(m2_train, m2_test)

In [20]:
# The pd.get_dummies encoding that used to live in this cell was removed:
# both X_train_encoded_m2 and X_test_encoded_m2 are (re)assigned by the
# OneHotEncoder cell below before anything reads them, so the result was
# never used. Encoding train and test independently with get_dummies can also
# yield different column sets when a category is missing from one split,
# whereas the fitted OneHotEncoder applies one mapping to both splits.

In [21]:
# Re-fit the shared `encoder` (created in the MONK 1 section) on the MONK-2
# development data, then apply the same mapping to the MONK-2 test data.
X_train_encoded_m2 = encoder.fit_transform(X_dev)
X_test_encoded_m2 = encoder.transform(X_test)

In [22]:
# Integer part of the square root of the number of MONK-2 training rows.
# NOTE(review): sqrt_n is not referenced by any later cell shown in this notebook.
sqrt_n = int(np.sqrt(X_train_encoded_m2.shape[0]))

In [23]:
# Search space for the MONK-2 KNN classifier (replaces the MONK-1 grid).
knn_hyperparam = {
    # k from 2 up to (but excluding) half the number of training rows
    "n_neighbors": np.arange(2, len(X_train_encoded_m2) // 2),
    "weights": ["uniform", "distance"],
    "metric": ["minkowski", "cosine"],
    # NOTE(review): in scikit-learn `p` only affects the Minkowski metric, so
    # the cosine combinations presumably repeat the same model for every p.
    "p": [1, 2, 3, 5, 7],
}

In [None]:
# Grid search for the MONK-2 KNN classifier.
# NOTE(review): cv_splits is left at CustomGridSearch's default here, while the
# MONK 1 section passes cv_splits=5 — confirm the default is the intended value.
knn_grid_search = CustomGridSearch(
    estimator='knn',
    hyperparameters=knn_hyperparam,
)
# Labels are flattened to 1-D for fitting; evaluate() receives them un-flattened.
knn_grid_search.fit(X_train_encoded_m2, y_dev.values.ravel())

# Score the selected model on the data it was tuned on.
accuracy_train, mse_train = knn_grid_search.evaluate(X_train_encoded_m2, y_dev)
print()
print("Accuracy Train:", accuracy_train)
print("MSE Train:", mse_train)

# Score the selected model on the MONK-2 test set.
accuracy_test, mse_test = knn_grid_search.evaluate(X_test_encoded_m2, y_test)
print()
print("Accuracy Test:", accuracy_test)
print("MSE Test:", mse_test)

# MONK 3

In [25]:
# Load the MONK-3 training and test datasets (train first, then test).
m3_train, m3_test = (
    MonksDataset(split_name) for split_name in ('monk3_train', 'monk3_test')
)
# Rebinds the generic X_dev / y_dev / X_test / y_test names to the MONK-3 data.
X_dev, y_dev, X_test, y_test = get_monks_data(m3_train, m3_test)

In [26]:
# The pd.get_dummies encoding that used to live in this cell was removed:
# both X_train_encoded_m3 and X_test_encoded_m3 are (re)assigned by the
# OneHotEncoder cell below before anything reads them, so the result was
# never used. Encoding train and test independently with get_dummies can also
# yield different column sets when a category is missing from one split,
# whereas the fitted OneHotEncoder applies one mapping to both splits.

In [27]:
# Re-fit the shared `encoder` (created in the MONK 1 section) on the MONK-3
# development data ...
X_train_encoded_m3 = encoder.fit_transform(X_dev)

# ... and apply the same mapping to the MONK-3 test data.
X_test_encoded_m3 = encoder.transform(X_test)

In [None]:
# Build the search space from the MONK-3 training set, mirroring the MONK 1
# and MONK 2 sections. Without this, the cell would pick up whatever
# `knn_hyperparam` is left in the kernel (the MONK-2 grid, whose n_neighbors
# range is sized from the MONK-2 training set) and would fail on a kernel
# where only this section has been run.
knn_hyperparam = {
    # k from 2 up to (but excluding) half the number of training rows
    "n_neighbors": np.arange(2, int(len(X_train_encoded_m3)/2)),
    "weights": ["uniform", "distance"],
    "metric": ["minkowski", "cosine"],
    "p": [1, 2, 3, 5, 7]
}

# Grid search for the MONK-3 KNN classifier.
# NOTE(review): cv_splits is left at CustomGridSearch's default here, while the
# MONK 1 section passes cv_splits=5 — confirm the default is the intended value.
knn_grid_search = CustomGridSearch(estimator = 'knn', hyperparameters=knn_hyperparam)
knn_grid_search.fit(X_train_encoded_m3, y_dev.values.ravel())

# Score the selected model on the data it was tuned on.
accuracy_train, mse_train = knn_grid_search.evaluate(X_train_encoded_m3, y_dev)
print()
print("Accuracy Train:", accuracy_train)
print("MSE Train:", mse_train)

# Score the selected model on the MONK-3 test set.
accuracy_test, mse_test = knn_grid_search.evaluate(X_test_encoded_m3, y_test)
print()
print("Accuracy Test:", accuracy_test)
print("MSE Test:", mse_test)

# CUP

In [4]:
# Load the CUP dataset identified by 'Cup_tr'.
cup = CupDataset('Cup_tr')
# Handle to the dataset's `data` attribute.
# NOTE(review): `df` is not referenced by any later cell shown in this notebook.
df = cup.data


In [5]:
# First split: development data (train/validation) versus the final test set.
cup.split_data(test_size=0.1, random_state=0)

# X_dev / y_dev: features and labels of the development set (train and
# validation combined). X_final_test / y_final_test: features and labels of
# the final test set.
X_dev, X_final_test, y_dev, y_final_test = cup.get_splits()

# Second split: carve an internal test set out of the development set.
# 0.111 is roughly 1/9; NOTE(review): if the first split keeps 90% of the
# data, this makes the internal test set about 10% of the full dataset — confirm.
X_train, X_internal_test, y_train, y_internal_test = train_test_split(
    X_dev,
    y_dev,
    test_size=0.111,
    random_state=0,
)

In [6]:
# Degree-2 polynomial expansion, fitted on the training split only.
poly = PolynomialFeatures(degree=2)

# Column 0 of the expansion is the constant bias term; it is dropped with
# [:, 1:] before np.arctanh is applied element-wise.
# NOTE(review): np.arctanh is only finite for inputs strictly inside (-1, 1).
# This assumes every expanded feature stays in that interval; any resulting
# RuntimeWarning would be hidden by the global warnings filter — confirm.
train_expanded = poly.fit_transform(X_train)
X_train_poly = np.arctanh(train_expanded[:, 1:])

# Apply the already-fitted expansion and the same transform to the other splits.
X_internal_test_poly, X_final_test_poly, X_dev_poly = (
    np.arctanh(poly.transform(split_features)[:, 1:])
    for split_features in (X_internal_test, X_final_test, X_dev)
)

In [7]:
# Search space for the CUP KNN regressor.
knn_hyperparam_cup = {
    # k = 2, 3, ..., 39
    "n_neighbors": np.arange(2, 40),
    "weights": ["uniform", "distance"],
    # Minkowski exponent: 1 and 2
    "p": [1, 2],
}

In [None]:
# Grid search for the CUP KNN regressor using the 'kfold' CV strategy,
# fitted on the transformed training split only.
knn_grid_search = CustomGridSearch(
    estimator='knn_reg',
    cv_strategy='kfold',
    hyperparameters=knn_hyperparam_cup,
)
knn_grid_search.fit(X_train_poly, y_train)


In [9]:
# Hyperparameters used for the final CUP model.
# NOTE(review): these are hard-coded rather than read from knn_grid_search;
# presumably copied from a previous grid-search run — confirm they still match.
best_params = {
    'n_neighbors': 4,
    'p': 2,
    'weights': 'distance',
}

In [10]:
# Unfitted KNN regressor configured with the hard-coded best_params.
best_knn = KNeighborsRegressor(**best_params)

In [None]:
# Fit on the transformed training split only; the internal test split stays unseen.
best_knn.fit(X_train_poly, y_train)

In [None]:
# Errors on the split the model was fitted on. Predict once and reuse the
# result for both metrics.
# NOTE: best_params uses weights='distance', so each training point is its own
# zero-distance neighbour; these errors are expected to be close to 0 and do
# not estimate generalisation.
train_pred = best_knn.predict(X_train_poly)
mse_train = mean_squared_error(y_train, train_pred)
mee_train = MEE(y_train, train_pred)

print(mse_train, mee_train)

In [None]:
# Errors on the internal test split, which the model has not been fitted on.
# Predict once and reuse the result for both metrics.
internal_pred = best_knn.predict(X_internal_test_poly)
mse_internal = mean_squared_error(y_internal_test, internal_pred)
mee_internal = MEE(y_internal_test, internal_pred)

print(mse_internal, mee_internal)

##### Model retrained on the full development set (training + internal test) and error estimated on the final test set





In [None]:
# Refit the same estimator object on the whole development set.
# NOTE: this replaces the model fitted on X_train_poly, so re-running the
# earlier train/internal-test scoring cells after this one would score the
# dev-fitted model instead.
best_knn.fit(X_dev_poly, y_dev)

In [None]:
# Errors on the development set the model was just refitted on. Predict once
# and reuse the result for both metrics.
# NOTE: with weights='distance' these in-sample errors are expected to be
# close to 0 and do not estimate generalisation.
dev_pred = best_knn.predict(X_dev_poly)
mse_dev = mean_squared_error(y_dev, dev_pred)
mee_dev = MEE(y_dev, dev_pred)

print(mse_dev, mee_dev)

In [17]:
# Predictions of the dev-fitted model on the held-out final test set.
test_pred = best_knn.predict(X_final_test_poly)

In [None]:
# Final test-set errors, computed from the stored predictions.
# Note: `mse_test` reuses a name also assigned in the MONK sections.
mse_test = mean_squared_error(y_final_test, test_pred)
mee_test = MEE(y_final_test, test_pred)
print(f"Test loss: {mse_test:.4f}, Test MEE: {mee_test:.4f}")