# Machine Learning Fundamentals - Lecture 03

This is the Jupyter notebook for Lecture 03 of the Machine Learning Fundamentals
course.

In [92]:
# Import the required libraries using the commonly used short names (pd, sns, ...)
import numpy as np
import pandas as pd
import seaborn as sns

# The Path object from pathlib allows us to easily build paths in an
# OS-independent fashion
from pathlib import Path

# Load the required scikit-learn classes and functions
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_absolute_percentage_error
import matplotlib.pyplot as plt
from scipy import stats

# Switch Seaborn to its "darkgrid" style, which is nicer to read than the
# default look
sns.set_style("darkgrid")

## Part 1: load and clean the Pokémon dataset

Here we just repeat the steps already done in the previous lectures, but in a
more succinct way.

In [93]:
# Read the raw CSV (the Path object keeps the location OS-independent) and
# clean it in a single chained expression, so that re-running this cell always
# rebuilds `df` from scratch
df = (
    pd.read_csv(Path("Pokemon.csv"))
    # Column names containing spaces or dots are awkward to work with, so
    # switch them to plain identifiers
    .rename(
        columns={
            "Type 1": "Type1",
            "Type 2": "Type2",
            "Sp. Atk": "SpAtk",
            "Sp. Def": "SpDef",
        }
    )
    # A missing secondary type becomes the explicit string "None"
    .assign(Type2=lambda frame: frame["Type2"].fillna("None"))
    # Primary and secondary types are categories rather than free-form
    # strings / objects, so store them with the category dtype
    .astype({"Type1": "category", "Type2": "category"})
)

Before we proceed to the interesting part, we'll perform our data scaling and
train/test data splitting.

In [94]:
# Feature columns: every base stat except "Total", which is just the sum of
# the other ones and therefore redundant
features = ["HP", "Attack", "Defense", "SpAtk", "SpDef", "Speed"]

# The labels for classification are the legendary status
y_leg = df["Legendary"].to_numpy()

# Keep only the selected feature columns
df_X = df.loc[:, features]

# Standardize the features
# NOTE(review): the scaler is fitted on the whole dataset before the split, so
# the test rows contribute to the scaling statistics. Acceptable for a demo,
# but fit on the training split only if the test score must be leak-free.
ss = StandardScaler()
ss.fit(df_X)
X = ss.transform(df_X)

# Hold out 20% of the samples for testing and train on the remaining 80%
# Change the random_state parameter to split the data in different ways
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_leg,
    test_size=0.2,
    random_state=42,
)

## Part 2: Implement our own $k$-Nearest Neighbors classifier and regressor

In [95]:
# Number of neighbors (k) used by the kNN experiments in this section; it is
# passed as `k` / `n_neighbors` in the cells below, so change it here to rerun
# them with a different k
k_for_all = 5

In [96]:
def knn_classify(X_Train, y_train, X_test, k=5):
    """Classify samples by majority vote among their k nearest training samples.

    Parameters
    ----------
    X_Train : array-like of shape (n_train, n_features)
        Training samples.
    y_train : array-like of shape (n_train,)
        Label of each training sample.
    X_test : array-like of shape (n_test, n_features)
        Samples to classify.
    k : int, default=5
        Number of neighbors taking part in the vote (1 <= k <= n_train).

    Returns
    -------
    numpy.ndarray of shape (n_test,)
        Predicted labels, with the same dtype as `y_train`. When several
        labels are tied for the majority, the smallest one wins.

    Raises
    ------
    ValueError
        If `k` is smaller than 1 or larger than the number of training samples.
    """
    # Accept any array-like for the labels (lists, pandas Series, ...)
    y_train = np.asarray(y_train)
    n_train = y_train.shape[0]

    if not 1 <= k <= n_train:
        raise ValueError(
            f"k must be between 1 and the number of training samples "
            f"({n_train}), got {k}"
        )

    # Pairwise distances: row i holds the distances from test sample i to
    # every training sample
    dists = euclidean_distances(X_test, X_Train)

    # Partition each row around position k - 1: the first k positions then hold
    # the indices of the k smallest distances (in no particular order). Using
    # k - 1 rather than k keeps the call valid when k equals n_train.
    idx_k_min = np.argpartition(dists, k - 1, axis=1)[:, :k]

    # Labels of the k nearest neighbors of each test sample
    labels_k_min = y_train[idx_k_min]

    maj_labels = np.empty(dists.shape[0], dtype=y_train.dtype)

    for i, row in enumerate(labels_k_min):
        # np.unique returns the sorted distinct labels and their frequencies;
        # argmax picks the first maximum, i.e. the smallest label on ties
        values, counts = np.unique(row, return_counts=True)
        maj_labels[i] = values[np.argmax(counts)]

    return maj_labels

In [97]:
# Reference model: scikit-learn's kNN classifier with the same k as ours
knnClf = KNeighborsClassifier(n_neighbors=k_for_all).fit(X_train, y_train)

# Predictions on the test set from both implementations
y_pred_ours = knn_classify(X_train, y_train, X_test, k=k_for_all)
y_pred_knn = knnClf.predict(X_test)

# Score both sets of predictions against the true labels
accuracy_ours = accuracy_score(y_test, y_pred_ours)
accuracy_knn = accuracy_score(y_test, y_pred_knn)

print(f"Accuracy (our kNN): {accuracy_ours:.4f}")
print(f"Accuracy (sklearn kNN): {accuracy_knn:.4f}")

Accuracy (our kNN): 0.9250
Accuracy (sklearn kNN): 0.9250


In [98]:
# Fraction of test samples for which our classifier and scikit-learn's predict
# the same label (1.0 means the two agree on every sample)
accuracy_score(y_pred_ours, y_pred_knn)

1.0

In [99]:
def knn_regression(X_Train, y_train, X_test, k=5):
    """Predict targets as the mean target of the k nearest training samples.

    Parameters
    ----------
    X_Train : array-like of shape (n_train, n_features)
        Training samples.
    y_train : array-like of shape (n_train,)
        Target value of each training sample.
    X_test : array-like of shape (n_test, n_features)
        Samples to predict.
    k : int, default=5
        Number of neighbors to average over (1 <= k <= n_train).

    Returns
    -------
    numpy.ndarray of shape (n_test,)
        Unweighted mean of the targets of the k nearest training samples.

    Raises
    ------
    ValueError
        If `k` is smaller than 1 or larger than the number of training samples.
    """
    # Accept any array-like for the targets (lists, pandas Series, ...)
    y_train = np.asarray(y_train)
    n_train = y_train.shape[0]

    if not 1 <= k <= n_train:
        raise ValueError(
            f"k must be between 1 and the number of training samples "
            f"({n_train}), got {k}"
        )

    # Pairwise distances: row i holds the distances from test sample i to
    # every training sample
    dists = euclidean_distances(X_test, X_Train)

    # Partition each row around position k - 1: the first k positions then hold
    # the indices of the k smallest distances (in no particular order). Using
    # k - 1 rather than k keeps the call valid when k equals n_train.
    idx_k_min = np.argpartition(dists, k - 1, axis=1)[:, :k]

    # Average the targets of the selected neighbors, one row per test sample
    return y_train[idx_k_min].mean(axis=1)

In [100]:
# For regression, the target is the "Total" column
y_total = df["Total"].to_numpy()

# Split again with the new target. The same test_size and random_state as in
# the classification split are used.
# NOTE: this rebinds X_train, X_test, y_train and y_test, so from here on
# y_train / y_test hold "Total" values instead of the legendary labels
X_train, X_test, y_train, y_test = train_test_split(X, y_total, test_size=0.2, random_state=42)

In [101]:
# Predict the test-set targets with our own kNN regressor ...
y_regr_ours = knn_regression(X_train, y_train, X_test, k=k_for_all)
# ... and show its mean absolute error against the true values
mean_absolute_error(y_test, y_regr_ours)

13.832500000000005

In [102]:
# Mean absolute percentage error of our kNN regressor on the test set
mean_absolute_percentage_error(y_test, y_regr_ours)

0.031841346483808264

In [103]:
# Fit scikit-learn's kNN regressor with the same k as our implementation.
# Without n_neighbors it would always use its default (5) and the comparison
# would silently break as soon as k_for_all is changed.
knnRegr = KNeighborsRegressor(n_neighbors=k_for_all)
knnRegr.fit(X_train, y_train)

# Predict the test set and show the mean absolute error
y_regr_knn = knnRegr.predict(X_test)
mean_absolute_error(y_test, y_regr_knn)

13.832500000000005

In [104]:
# Mean absolute percentage error of scikit-learn's kNN regressor on the test
# set (scores y_regr_knn, the sklearn predictions, not our own y_regr_ours)
mean_absolute_percentage_error(y_test, y_regr_knn)

0.031841346483808264

In [None]:
# Relative difference between our predictions and scikit-learn's; a result of
# 0.0 would mean both regressors produced the same predictions
mean_absolute_percentage_error(y_regr_ours, y_regr_knn)