# Installing Dependencies

In [None]:
%pip install numpy
%pip install pandas
%pip install matplotlib
%pip install scikit-learn
%pip install scipy

# Ex 1.

Compare the performance of a $kNN$ with $k = 5$ and a naïve Bayes with Gaussian \
assumption (consider all remaining parameters as default):

### a)
Plot two boxplots with the fold accuracies for each classifier. Is there one \
more stable than the other regarding performance? Why do you think that is the \
case? Explain. 
> (**Refer to the report for the explanation**)

In [None]:
# Code for a)

### b)
Report the accuracy of both models, this time scaling the data with a \
Min-Max scaler before training the models. Explain the impact that this \
preprocessing step has on the performance of each model, providing an \
explanation for the results. 
> (**Refer to the report for the explanation**)

In [None]:
# Code for b)

### c)
Using `scipy`, test the hypothesis “the $kNN$ model is statistically superior to \
naïve Bayes regarding accuracy”, asserting whether it is true.
> (**Refer to the report for the explanation**)

In [None]:
# Code for c)

# Ex 2.
Using an 80-20 train-test split, vary the number of neighbors of a $kNN$ classifier using \
$k = \{1, 5, 10, 20, 30\}$. Additionally, for each $k$, train one classifier using uniform weights \
and distance weights.

### a)
Plot the train and test accuracy for each model.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier


# Load the heart-disease dataset into a dataframe
df = pd.read_csv('./heart-disease.csv', sep=',')

# Separate the label column ('target') from the remaining input columns
y = df['target']
X = df.drop('target', axis=1)

# 80-20 train-test split with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y, train_size=0.8, random_state=0
)

k_values = [1, 5, 10, 20, 30]

# Accuracy histories, one entry per value in k_values
uni_weights_train_accuracies, uni_weights_test_accuracies = [], []
dist_weights_train_accuracies, dist_weights_test_accuracies = [], []

# For every k, fit one kNN per weighting scheme and record its
# accuracy on both the training and the testing partition
for k in k_values:
    for weights, train_scores, test_scores in (
        ('uniform', uni_weights_train_accuracies, uni_weights_test_accuracies),
        ('distance', dist_weights_train_accuracies, dist_weights_test_accuracies),
    ):
        clf = KNeighborsClassifier(n_neighbors=k, weights=weights)
        clf.fit(X_train, y_train)

        train_scores.append(accuracy_score(y_train, clf.predict(X_train)))
        test_scores.append(accuracy_score(y_test, clf.predict(X_test)))

# Draw one figure per weighting scheme, each showing the training and
# testing accuracy curves against the number of neighbors
figure_specs = (
    (
        uni_weights_train_accuracies,
        uni_weights_test_accuracies,
        'Training and Testing Accuracy for multiple kNN classifiers with uniform weights',
    ),
    (
        dist_weights_train_accuracies,
        dist_weights_test_accuracies,
        'Training and Testing Accuracy for multiple kNN classifiers with distance weights',
    ),
)

for train_scores, test_scores, figure_title in figure_specs:
    plt.figure(figsize=(10, 6))

    curves = (
        (train_scores, 'Training Accuracy'),
        (test_scores, 'Testing Accuracy'),
    )
    for scores, curve_label in curves:
        plt.plot(k_values, scores, label=curve_label, marker='o')

    plt.xlabel('k value')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.grid(True, which='both', linestyle='--', linewidth=0.5, alpha=0.5)
    plt.title(figure_title)
    plt.show()

### b)
Explain the impact of increasing the number of neighbors on the generalization ability of \
the models.
> (**Refer to the report for the explanation**)

# Ex 3.
Considering the unique properties of the `heart-disease.csv` dataset, identify two
possible difficulties of the naïve Bayes model used in the previous exercises when learning
from the given dataset.
> (**Refer to the report for the explanation**)