In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier as KNN

# Load data
# Wrap the iris arrays in pandas containers: the features go into X_data
# (one column per feature name), the integer targets into y_data['label'].
iris = load_iris()
X_data = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y_data = pd.DataFrame({'label': iris.target})

# Add a categorical 'species' column by translating each integer label
# through a fixed label -> name dictionary.
y_data['species'] = (
    y_data['label']
    .map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
    .astype('category')
)

# Data preparation
# Hold out one third of the samples as a test set. The split is shuffled and
# stratified on the label so that both subsets keep the class proportions of
# the full data. A fixed seed makes the split, and therefore every metric
# printed further down, reproducible from run to run; it uses the same seed
# as the validation split made later when tuning k.
test_frac = 1/3
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data['label'],
    test_size=test_frac,
    shuffle=True,
    stratify=y_data['label'],
    random_state=42,
)

# Inspection of the training set
# Descriptive statistics of every training feature, as computed by
# DataFrame.describe().
train_summary = X_train.describe()
print("Summary of the training set:")
print(train_summary)

# Pairwise correlation between the training features. The header and the
# matrix are emitted on separate lines by using a newline separator.
correlation_matrix = X_train.corr()
print("Correlation matrix:", correlation_matrix, sep="\n")

# Visualization
print("Visualization:")

# One figure per feature: the distribution of that feature in the training
# set, with one box per class label on the x axis.
for feature in X_train.columns:
    sns.boxplot(x=y_train, y=X_train[feature])
    plt.title(f'Boxplot of {feature} grouped by species')
    plt.show()

# Scatter matrix of all feature pairs, colored by class label. The features
# and labels share the same index, so the join simply appends the 'label'
# column that `hue` refers to.
sns.pairplot(X_train.join(y_train, how='outer'), hue='label')
# pairplot draws a grid of subplots, so plt.title would only title a single
# subplot (the current axes). plt.suptitle titles the figure as a whole;
# subplots_adjust lowers the top of the grid to leave room for it.
plt.suptitle('Pairplot of features colored by species')
plt.subplots_adjust(top=0.94)
plt.show()

# Classification with KNN
# Classification using all the features
# Create a baseline k-NN classifier with k=3
knn_classifier = KNN(n_neighbors=3)
knn_classifier.fit(X_train, y_train)

# Classification errors
# Training error computed by hand from the predictions: the fraction of
# training samples whose predicted label differs from the true label.
# (Despite its name, train_error_predict holds the predicted labels.)
train_error_predict = knn_classifier.predict(X_train)
train_error = np.mean(train_error_predict != y_train)
print("Classification error on training set (using predict method):", train_error)

# The same quantity obtained through score(), which returns the mean
# accuracy; the two printed values should agree up to floating-point rounding.
train_score = knn_classifier.score(X_train, y_train)
print("Classification error on training set (using score method):", 1 - train_score)

# Error on the held-out test set.
test_error = 1 - knn_classifier.score(X_test, y_test)
print("Classification error on test set:", test_error)

# Confusion matrix
# Computed on the test set, with true labels first and predictions second.
conf_mat = confusion_matrix(y_test, knn_classifier.predict(X_test))
print("Confusion matrix:")
print(conf_mat)

# Classification using only 2 features
# Restrict the training data to the two columns named in features_2D.
features_2D = ['petal length (cm)', 'sepal width (cm)']
X_train_2D = X_train.loc[:, features_2D]

# Build and fit the 3-nearest-neighbor classifier on those two features;
# fit() returns the estimator itself, so construction and fitting chain.
knn_classifier_2D = KNN(n_neighbors=3).fit(X_train_2D, y_train)

# Decision boundary
# Mesh the input space: a regular grid with step h that covers the range of
# both selected features in the training set, padded by 1 on every side.
h = .02
x_min, x_max = X_train_2D[features_2D[0]].min() - 1, X_train_2D[features_2D[0]].max() + 1
y_min, y_max = X_train_2D[features_2D[1]].min() - 1, X_train_2D[features_2D[1]].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# One row per grid node, columns ordered like features_2D.
mesh_data = np.c_[xx.ravel(), yy.ravel()]

# Predict labels of the mesh's nodes
# The classifier was fitted on a DataFrame, so the nodes are passed with the
# same column names; a bare array would make scikit-learn warn that X does
# not have valid feature names.
y_pred_mesh = knn_classifier_2D.predict(pd.DataFrame(mesh_data, columns=features_2D))

# Plot decision boundary
# Filled contours show the predicted class of each region; the training
# points are drawn on top, colored by their true label.
plt.figure()
plt.contourf(xx, yy, y_pred_mesh.reshape(xx.shape), alpha=0.8)
plt.scatter(X_train_2D[features_2D[0]], X_train_2D[features_2D[1]], c=y_train, edgecolors='k', s=20)
plt.title("Decision boundary of k-NN classifier with 2 features")
plt.xlabel(features_2D[0])
plt.ylabel(features_2D[1])
plt.show()

# Tune the k parameter manually
# Carve a validation set (20%) out of the 2-feature training data. The split
# is seeded for reproducibility and stratified on the label, like the initial
# train/test split, so the small validation set keeps the class proportions.
X_train_small, X_valid, y_train_small, y_valid = train_test_split(
    X_train_2D, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Fit the model for different values of k and predict on the validation set
# Each candidate model is a throwaway object created inside the expression,
# so the baseline all-features `knn_classifier` fitted earlier is not
# overwritten by a 2-feature model.
candidate_ks = range(1, 21)
validation_errors = [
    1 - KNN(n_neighbors=k).fit(X_train_small, y_train_small).score(X_valid, y_valid)
    for k in candidate_ks
]

# np.argmin returns the first minimum, so ties are resolved in favor of the
# smallest k. Indexing candidate_ks keeps best_k correct even if the range of
# candidates is changed.
best_k = candidate_ks[int(np.argmin(validation_errors))]
print("Best k value found using manual tuning:", best_k)

# Bonus: with GridSearchCV
from sklearn.model_selection import GridSearchCV

# Search k jointly with the neighbor weighting scheme, using 5-fold
# cross-validation on the 2-feature training data and selecting by accuracy.
# np.arange excludes its stop value, so the stop is 21 to cover k = 1..20,
# the same candidates as the manual tuning loop.
grid = {'n_neighbors': np.arange(1, 21), 'weights': ['uniform', 'distance']}
knn_gs = GridSearchCV(estimator=KNN(), param_grid=grid, cv=5, scoring='accuracy')
knn_gs.fit(X_train_2D, y_train)
print("Best parameters found using GridSearchCV:", knn_gs.best_params_)

# Evaluate the best estimator found by the search on the held-out test set,
# restricted to the same two features it was trained on.
print("Test error with best parameters:", 1 - knn_gs.best_estimator_.score(X_test[features_2D], y_test))
