# ML400 Exercises: KNN & Clustering

These exercises cover K-Nearest Neighbors classification, KMeans clustering,
hierarchical clustering, DBSCAN, and PCA for dimensionality reduction.

**Difficulty increases with each exercise.**

In [None]:
# ============================================================
# Setup: Run this cell first
# ============================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import silhouette_score, accuracy_score
from sklearn.datasets import load_iris, make_blobs, make_moons
from sklearn.pipeline import Pipeline
from scipy.cluster.hierarchy import dendrogram, linkage

# One seed value for the whole notebook: it is exposed as a constant and
# also used to seed NumPy's global random number generator.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("Setup complete.")

---
## Exercise 1: KNN Classifier -- Accuracy vs. k (Scaled vs. Unscaled)

**Goal:** Train KNN classifiers on the Iris dataset with different values of k,
both with and without feature scaling, and plot accuracy vs. k.

**Tasks:**
1. Load the Iris dataset and split 80/20.
2. For k in [1, 3, 5, 7, 9], train KNN **without** scaling and record test accuracy.
3. For k in [1, 3, 5, 7, 9], train KNN **with** `StandardScaler` and record test accuracy.
4. Plot both accuracy curves on the same figure.
5. Print the best k for each approach.

In [None]:
# Exercise 1 - Starter Code

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load Iris and hold out 20% of the samples as a test set; the fixed
# random_state keeps the split identical from run to run.
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Odd neighbor counts to sweep over: 1, 3, 5, 7, 9
k_values = list(range(1, 10, 2))

# TODO 1: Train KNN without scaling for each k
# unscaled_accuracies = []
# for k in k_values:
#     knn = KNeighborsClassifier(n_neighbors=k)
#     knn.fit(X_train, y_train)
#     acc = accuracy_score(y_test, knn.predict(X_test))
#     unscaled_accuracies.append(acc)

# TODO 2: Scale the data and train KNN for each k
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)
# scaled_accuracies = []
# for k in k_values:
#     knn = KNeighborsClassifier(n_neighbors=k)
#     knn.fit(X_train_scaled, y_train)
#     acc = accuracy_score(y_test, knn.predict(X_test_scaled))
#     scaled_accuracies.append(acc)

# TODO 3: Plot accuracy vs k
# plt.figure(figsize=(8, 5))
# plt.plot(k_values, unscaled_accuracies, 'o-', label='Unscaled')
# plt.plot(k_values, scaled_accuracies, 's-', label='Scaled')
# plt.xlabel('k (Number of Neighbors)')
# plt.ylabel('Test Accuracy')
# plt.title('KNN: Accuracy vs k')
# plt.legend()
# plt.xticks(k_values)
# plt.tight_layout()
# plt.show()

# TODO 4: Print best k for each approach
# best_unscaled_k = k_values[np.argmax(unscaled_accuracies)]
# best_scaled_k = k_values[np.argmax(scaled_accuracies)]
# print(f"Best k (unscaled): {best_unscaled_k} with accuracy {max(unscaled_accuracies):.4f}")
# print(f"Best k (scaled):   {best_scaled_k} with accuracy {max(scaled_accuracies):.4f}")

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Same data and 80/20 split as the starter cell
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

k_values = list(range(1, 10, 2))


def knn_test_accuracy(k, train_features, test_features):
    """Fit a k-NN classifier on the training split and return its test accuracy."""
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(train_features, y_train)
    return accuracy_score(y_test, model.predict(test_features))


# 1. Without scaling: raw feature values are used as-is
unscaled_accuracies = [knn_test_accuracy(k, X_train, X_test) for k in k_values]

# 2. With scaling: the scaler is fitted on the training split only and the
#    fitted transform is then reused on the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
scaled_accuracies = [
    knn_test_accuracy(k, X_train_scaled, X_test_scaled) for k in k_values
]

# 3. Plot both accuracy curves on the same figure
plt.figure(figsize=(8, 5))
for accuracies, marker_style, curve_label in (
    (unscaled_accuracies, 'o-', 'Unscaled'),
    (scaled_accuracies, 's-', 'Scaled'),
):
    plt.plot(k_values, accuracies, marker_style, label=curve_label)
plt.xlabel('k (Number of Neighbors)')
plt.ylabel('Test Accuracy')
plt.title('KNN: Accuracy vs k')
plt.legend()
plt.xticks(k_values)
plt.tight_layout()
plt.show()

# 4. Best k (np.argmax returns the first maximum, so ties go to the smallest k)
best_unscaled_k = k_values[np.argmax(unscaled_accuracies)]
best_scaled_k = k_values[np.argmax(scaled_accuracies)]
print(f"Best k (unscaled): {best_unscaled_k} with accuracy {max(unscaled_accuracies):.4f}")
print(f"Best k (scaled):   {best_scaled_k} with accuracy {max(scaled_accuracies):.4f}")
```

</details>

---
## Exercise 2: KMeans -- Elbow Method and Silhouette Score

**Goal:** Apply KMeans to a blob dataset, use the elbow method (inertia) and
silhouette score to find the optimal number of clusters.

**Tasks:**
1. Generate data with `make_blobs` (4 centers, `random_state=42`).
2. Run KMeans for k = 2 through 8, recording inertia and silhouette scores.
3. Plot the elbow curve (inertia vs. k).
4. Plot silhouette score vs. k.
5. Identify the optimal k from both methods.

In [None]:
# Exercise 2 - Starter Code

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Synthetic data: 500 points drawn around 4 blob centers
X, y_true = make_blobs(n_samples=500, centers=4, cluster_std=1.0, random_state=42)

# Quick look at the ground-truth grouping before any clustering is run
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis', alpha=0.5, s=20)
ax.set_title('True Clusters')
plt.show()

# Candidate cluster counts: k = 2 through 8
k_range = range(2, 9)

# TODO 1: Run KMeans for each k, record inertia and silhouette scores
# inertias = []
# silhouette_scores = []
# for k in k_range:
#     kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
#     labels = kmeans.fit_predict(X)
#     inertias.append(kmeans.inertia_)
#     silhouette_scores.append(silhouette_score(X, labels))

# TODO 2: Plot elbow curve and silhouette score side by side
# fig, axes = plt.subplots(1, 2, figsize=(14, 5))
#
# axes[0].plot(list(k_range), inertias, 'o-')
# axes[0].set_xlabel('Number of Clusters (k)')
# axes[0].set_ylabel('Inertia')
# axes[0].set_title('Elbow Method')
#
# axes[1].plot(list(k_range), silhouette_scores, 's-')
# axes[1].set_xlabel('Number of Clusters (k)')
# axes[1].set_ylabel('Silhouette Score')
# axes[1].set_title('Silhouette Score')
#
# plt.tight_layout()
# plt.show()

# TODO 3: Print the optimal k
# best_k = list(k_range)[np.argmax(silhouette_scores)]
# print(f"Best k by silhouette score: {best_k}")

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

X, y_true = make_blobs(n_samples=500, centers=4, cluster_std=1.0, random_state=42)

k_range = range(2, 9)
ks = list(k_range)

# 1. Fit one KMeans model per candidate k, then read off both quality measures
fitted_models = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(X) for k in ks
]
inertias = [model.inertia_ for model in fitted_models]
silhouette_scores = [silhouette_score(X, model.labels_) for model in fitted_models]

# 2. Plot the elbow curve (left) and the silhouette curve (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
curves = (
    (inertias, 'o-', 'Inertia', 'Elbow Method'),
    (silhouette_scores, 's-', 'Silhouette Score', 'Silhouette Score'),
)
for ax, (values, marker_style, y_label, title) in zip(axes, curves):
    ax.plot(ks, values, marker_style)
    ax.set_xlabel('Number of Clusters (k)')
    ax.set_ylabel(y_label)
    ax.set_title(title)

plt.tight_layout()
plt.show()

# 3. Optimal k: the candidate with the highest silhouette score
best_k = ks[np.argmax(silhouette_scores)]
print(f"Best k by silhouette score: {best_k}")
formatted_scores = {k: f'{score:.3f}' for k, score in zip(ks, silhouette_scores)}
print(f"Silhouette scores: {formatted_scores}")
```

</details>

---
## Exercise 3: Hierarchical Clustering with Dendrogram

**Goal:** Create a dendrogram using `scipy` hierarchical clustering on a small
dataset to visualize the cluster merging process.

**Tasks:**
1. Generate a small dataset with `make_blobs` (50 samples, 3 centers).
2. Compute the linkage matrix using Ward's method.
3. Plot the dendrogram.
4. Draw a horizontal line at the cut height for 3 clusters.
5. Extract cluster labels and visualize the clustered data.

In [None]:
# Exercise 3 - Starter Code

from sklearn.datasets import make_blobs
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Small toy set (50 points around 3 centers) so the dendrogram stays readable
X, y_true = make_blobs(50, centers=3, cluster_std=1.0, random_state=42)

# TODO 1: Compute linkage matrix (Ward's method)
# Z = linkage(X, method='ward')

# TODO 2: Plot dendrogram
# plt.figure(figsize=(12, 6))
# dendrogram(Z, leaf_rotation=90, leaf_font_size=8)
# plt.title('Hierarchical Clustering Dendrogram')
# plt.xlabel('Sample Index')
# plt.ylabel('Distance')

# TODO 3: Draw horizontal line for 3-cluster cut
# Choose a distance threshold that yields 3 clusters
# plt.axhline(y=..., color='r', linestyle='--', label='3-cluster cut')
# plt.legend()
# plt.tight_layout()
# plt.show()

# TODO 4: Extract cluster labels for 3 clusters
# labels = fcluster(Z, t=3, criterion='maxclust')

# TODO 5: Visualize clustered data
# fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# axes[0].scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis', s=50)
# axes[0].set_title('True Labels')
# axes[1].scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=50)
# axes[1].set_title('Hierarchical Clustering Labels')
# plt.tight_layout()
# plt.show()

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import make_blobs
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

X, y_true = make_blobs(50, centers=3, cluster_std=1.0, random_state=42)

# 1. Linkage matrix using Ward's method
Z = linkage(X, method='ward')

# 2. Dendrogram of the full merge history
plt.figure(figsize=(12, 6))
dendrogram(Z, leaf_rotation=90, leaf_font_size=8)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample Index')
plt.ylabel('Distance')

# 3. Cut line. Column 2 of Z holds the distance of each merge. Z[-3] is the
#    merge that leaves three clusters and Z[-2] the one that leaves two, so
#    any height between those two distances cuts the tree into 3 clusters.
#    The midpoint is used here.
three_cluster_height = Z[-3, 2]
two_cluster_height = Z[-2, 2]
cut_height = three_cluster_height + (two_cluster_height - three_cluster_height) / 2
plt.axhline(y=cut_height, color='r', linestyle='--', label=f'3-cluster cut (h={cut_height:.1f})')
plt.legend()
plt.tight_layout()
plt.show()

# 4. Flat cluster labels: ask for at most 3 clusters
labels = fcluster(Z, t=3, criterion='maxclust')

# 5. Ground truth (left) next to the hierarchical assignment (right)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels = (
    (y_true, 'True Labels'),
    (labels, 'Hierarchical Clustering Labels'),
)
for ax, (point_colors, title) in zip(axes, panels):
    ax.scatter(X[:, 0], X[:, 1], c=point_colors, cmap='viridis', s=50)
    ax.set_title(title)
plt.tight_layout()
plt.show()
```

</details>

---
## Exercise 4: KMeans vs. DBSCAN on make_moons

**Goal:** Compare KMeans and DBSCAN on the `make_moons` dataset, which has
non-convex cluster shapes that KMeans struggles with.

**Tasks:**
1. Generate data with `make_moons(n_samples=300, noise=0.1, random_state=42)`.
2. Apply KMeans with k=2.
3. Apply DBSCAN with `eps=0.2` and `min_samples=5`.
4. Visualize results side by side (true labels, KMeans, DBSCAN).
5. Discuss why DBSCAN performs better on this dataset.

In [None]:
# Exercise 4 - Starter Code

from sklearn.datasets import make_moons
from sklearn.cluster import KMeans, DBSCAN

# Two interleaving half-moons (300 points) with a little noise added
X, y_true = make_moons(300, noise=0.1, random_state=42)

# TODO 1: Apply KMeans (k=2)
# kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
# kmeans_labels = kmeans.fit_predict(X)

# TODO 2: Apply DBSCAN (eps=0.2, min_samples=5)
# dbscan = DBSCAN(eps=0.2, min_samples=5)
# dbscan_labels = dbscan.fit_predict(X)

# TODO 3: Visualize all three side by side
# fig, axes = plt.subplots(1, 3, figsize=(18, 5))
#
# axes[0].scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis', s=20)
# axes[0].set_title('True Labels')
#
# axes[1].scatter(X[:, 0], X[:, 1], c=kmeans_labels, cmap='viridis', s=20)
# axes[1].set_title('KMeans (k=2)')
#
# axes[2].scatter(X[:, 0], X[:, 1], c=dbscan_labels, cmap='viridis', s=20)
# axes[2].set_title(f'DBSCAN (eps=0.2) - {len(set(dbscan_labels) - {-1})} clusters')
#
# plt.tight_layout()
# plt.show()

# TODO 4: Discuss why DBSCAN works better here
# Your answer: ...

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import make_moons
from sklearn.cluster import KMeans, DBSCAN

X, y_true = make_moons(300, noise=0.1, random_state=42)

# 1. KMeans with two clusters
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(X)

# 2. DBSCAN with the parameters given in the task
dbscan = DBSCAN(eps=0.2, min_samples=5)
dbscan_labels = dbscan.fit_predict(X)

# The label -1 is excluded from the cluster count and tallied separately as noise
n_clusters_dbscan = len({label for label in dbscan_labels if label != -1})
n_noise = (dbscan_labels == -1).sum()

# 3. Visualize: ground truth, KMeans and DBSCAN side by side
panels = (
    (y_true, 'True Labels'),
    (kmeans_labels, 'KMeans (k=2)'),
    (dbscan_labels, f'DBSCAN (eps=0.2) - {n_clusters_dbscan} clusters, {n_noise} noise'),
)
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, (point_colors, title) in zip(axes, panels):
    ax.scatter(X[:, 0], X[:, 1], c=point_colors, cmap='viridis', s=20)
    ax.set_title(title)

plt.tight_layout()
plt.show()

# 4. Discussion:
# KMeans assigns every point to its nearest centroid, which implicitly
# assumes convex, roughly spherical clusters. The crescent shapes violate
# that assumption, so KMeans splits the moons incorrectly.
# DBSCAN groups points by density, so it can follow arbitrarily shaped
# clusters and is much better suited to non-convex geometries like moons.
print("KMeans struggles with non-convex shapes because it uses centroids.")
print("DBSCAN is density-based and can discover arbitrarily shaped clusters.")
```

</details>

---
## Exercise 5: PCA + Clustering Pipeline

**Goal:** Apply PCA to reduce a 10-feature dataset to 2D, plot the explained
variance ratio, and then cluster the reduced data.

**Tasks:**
1. Generate a high-dimensional dataset with `make_blobs` (10 features, 4 centers).
2. Standardize the features, then apply PCA and plot the cumulative explained variance ratio.
3. Reduce to 2 components and visualize the data.
4. Apply KMeans (k=4) on the 2D PCA data.
5. Visualize the true labels vs. KMeans labels on the PCA projection.

In [None]:
# Exercise 5 - Starter Code

from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# 500 samples with 10 features each, drawn around 4 centers
X, y_true = make_blobs(500, 10, centers=4, cluster_std=2.0, random_state=42)
print(f"Original shape: {X.shape}")

# Standardize the features before PCA
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# TODO 1: Apply PCA (keep all components first to see variance)
# pca_full = PCA(random_state=42)
# pca_full.fit(X_scaled)

# TODO 2: Plot cumulative explained variance
# plt.figure(figsize=(8, 5))
# cumulative_var = np.cumsum(pca_full.explained_variance_ratio_)
# plt.plot(range(1, len(cumulative_var) + 1), cumulative_var, 'o-')
# plt.xlabel('Number of Components')
# plt.ylabel('Cumulative Explained Variance')
# plt.title('PCA: Cumulative Explained Variance')
# plt.axhline(y=0.95, color='r', linestyle='--', label='95% threshold')
# plt.legend()
# plt.tight_layout()
# plt.show()

# TODO 3: Reduce to 2 components
# pca_2d = PCA(n_components=2, random_state=42)
# X_2d = pca_2d.fit_transform(X_scaled)
# print(f"Reduced shape: {X_2d.shape}")
# print(f"Explained variance (2 components): {pca_2d.explained_variance_ratio_.sum():.4f}")

# TODO 4: Apply KMeans on 2D data
# kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
# kmeans_labels = kmeans.fit_predict(X_2d)

# TODO 5: Visualize true vs KMeans labels
# fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# axes[0].scatter(X_2d[:, 0], X_2d[:, 1], c=y_true, cmap='viridis', s=20, alpha=0.6)
# axes[0].set_title('True Labels (PCA 2D)')
# axes[0].set_xlabel('PC1')
# axes[0].set_ylabel('PC2')
# axes[1].scatter(X_2d[:, 0], X_2d[:, 1], c=kmeans_labels, cmap='viridis', s=20, alpha=0.6)
# axes[1].set_title('KMeans Labels (PCA 2D)')
# axes[1].set_xlabel('PC1')
# axes[1].set_ylabel('PC2')
# plt.tight_layout()
# plt.show()

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Imported here so this cell lists everything it uses, like the other cells do;
# silhouette_score is called at the end of the solution.
from sklearn.metrics import silhouette_score

# 500 samples with 10 features each, drawn around 4 centers
X, y_true = make_blobs(
    n_samples=500, n_features=10, centers=4,
    cluster_std=2.0, random_state=42
)

# Standardize the features before PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 1. Full PCA (all components kept) to inspect how variance is distributed.
#    random_state matches the starter scaffold; per the scikit-learn docs it
#    only has an effect with the 'arpack' or 'randomized' SVD solvers.
pca_full = PCA(random_state=42)
pca_full.fit(X_scaled)

# 2. Cumulative explained variance, with a 95% reference line
plt.figure(figsize=(8, 5))
cumulative_var = np.cumsum(pca_full.explained_variance_ratio_)
plt.plot(range(1, len(cumulative_var) + 1), cumulative_var, 'o-')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA: Cumulative Explained Variance')
plt.axhline(y=0.95, color='r', linestyle='--', label='95% threshold')
plt.legend()
plt.tight_layout()
plt.show()

# 3. Reduce to 2D (same random_state as the starter scaffold)
pca_2d = PCA(n_components=2, random_state=42)
X_2d = pca_2d.fit_transform(X_scaled)
print(f"Reduced shape: {X_2d.shape}")
print(f"Explained variance (2 components): {pca_2d.explained_variance_ratio_.sum():.4f}")

# 4. KMeans on the 2D projection
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(X_2d)

# 5. Visualize true labels (left) against KMeans labels (right) in PCA space
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
axes[0].scatter(X_2d[:, 0], X_2d[:, 1], c=y_true, cmap='viridis', s=20, alpha=0.6)
axes[0].set_title('True Labels (PCA 2D)')
axes[0].set_xlabel('PC1')
axes[0].set_ylabel('PC2')
axes[1].scatter(X_2d[:, 0], X_2d[:, 1], c=kmeans_labels, cmap='viridis', s=20, alpha=0.6)
axes[1].set_title('KMeans Labels (PCA 2D)')
axes[1].set_xlabel('PC1')
axes[1].set_ylabel('PC2')
plt.tight_layout()
plt.show()

# Extra check beyond the task list: silhouette score of the clustering in PCA space
sil = silhouette_score(X_2d, kmeans_labels)
print(f"Silhouette score (PCA 2D + KMeans): {sil:.4f}")
```

</details>