# 5. Model Evaluation (Hands-on with KNN)

This notebook practices the evaluation concepts from class with simple and readable code.

Main goals:
- Build confusion matrices manually.
- Compute classification metrics manually (accuracy, precision, sensitivity/recall, specificity, F1).
- Understand the ROC curve and AUC by sweeping the decision threshold.
- Compute regression metrics manually (MAE, MSE, RMSE, R²).
- Compare models using `KNeighborsClassifier` and `KNeighborsRegressor`.


## 0. Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

from sklearn.datasets import load_breast_cancer, load_iris, load_diabetes, make_classification
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

np.set_printoptions(suppress=True)


## 1. Helper functions (manual implementations)

In [None]:
def safe_div(num, den):
    """Return ``num / den``, or ``0.0`` when ``den`` equals zero."""
    if den != 0:
        return num / den
    return 0.0


def train_test_split_manual(X, y, test_size=0.2, seed=42, stratify=False):
    """Split ``X`` and ``y`` into shuffled train and test subsets.

    Parameters
    ----------
    X : array-like
        Samples, indexed along the first axis.
    y : array-like
        One target per sample.
    test_size : float, default 0.2
        Fraction in ``[0, 1]`` of the samples sent to the test set. The
        count is ``round(test_size * n)``; when ``stratify`` is true the
        rounding is applied separately to every class.
    seed : int, default 42
        Seed for ``np.random.default_rng``; the same seed gives the same split.
    stratify : bool, default False
        When true, each distinct value of ``y`` is split on its own so the
        class proportions are kept (up to rounding) in both subsets.

    Returns
    -------
    tuple of np.ndarray
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` have different lengths or ``test_size`` is
        outside ``[0, 1]``.
    """
    X = np.array(X)
    y = np.array(y)

    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
        )
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    rng = np.random.default_rng(seed)
    n = len(y)

    if not stratify:
        indices = np.arange(n)
        rng.shuffle(indices)

        n_test = int(round(test_size * n))
        test_idx = indices[:n_test]
        train_idx = indices[n_test:]
    else:
        train_idx = []
        test_idx = []

        for cls in np.unique(y):
            cls_idx = np.where(y == cls)[0]
            rng.shuffle(cls_idx)

            n_cls_test = int(round(test_size * len(cls_idx)))
            test_idx.extend(cls_idx[:n_cls_test])
            train_idx.extend(cls_idx[n_cls_test:])

        # Force an integer dtype: an empty list would otherwise turn into a
        # float64 array, which NumPy refuses to use as an index.
        train_idx = np.array(train_idx, dtype=np.intp)
        test_idx = np.array(test_idx, dtype=np.intp)

        # Classes were appended one after another; shuffle to interleave them.
        rng.shuffle(train_idx)
        rng.shuffle(test_idx)

    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]


def confusion_matrix_binary_manual(y_true, y_pred, positive_label=1):
    """Count the four outcomes of a binary classifier.

    Every label different from ``positive_label`` is treated as negative.

    Returns a dict with integer counts under the keys
    ``'TP'``, ``'FP'``, ``'TN'`` and ``'FN'``.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)

    actual_pos = actual == positive_label
    actual_neg = actual != positive_label
    predicted_pos = predicted == positive_label
    predicted_neg = predicted != positive_label

    def count(mask):
        # Number of samples where the combined condition holds.
        return int(np.sum(mask))

    return {
        'TP': count(actual_pos & predicted_pos),
        'FP': count(actual_neg & predicted_pos),
        'TN': count(actual_neg & predicted_neg),
        'FN': count(actual_pos & predicted_neg),
    }


def classification_metrics_from_counts(tp, fp, tn, fn):
    """Derive the standard binary metrics from confusion-matrix counts.

    Each ratio goes through ``safe_div``, so a zero denominator yields 0.0
    instead of raising.

    Returns a dict with the keys ``'accuracy'``, ``'sensitivity_recall_tpr'``,
    ``'specificity_tnr'``, ``'precision_ppv'`` and ``'f1_score'``.
    """
    n_samples = tp + fp + tn + fn

    # Sensitivity (recall, TPR): share of actual positives that were found.
    sensitivity = safe_div(tp, tp + fn)
    # Specificity (TNR): share of actual negatives that were rejected.
    specificity = safe_div(tn, tn + fp)
    # Precision (PPV): share of positive predictions that were right.
    precision = safe_div(tp, tp + fp)

    # F1 is the harmonic mean of precision and sensitivity.
    f1_numerator = 2 * precision * sensitivity
    f1_denominator = precision + sensitivity

    return {
        'accuracy': safe_div(tp + tn, n_samples),
        'sensitivity_recall_tpr': sensitivity,
        'specificity_tnr': specificity,
        'precision_ppv': precision,
        'f1_score': safe_div(f1_numerator, f1_denominator),
    }


def evaluate_binary_predictions(y_true, y_pred, positive_label=1):
    """Return confusion counts and derived metrics in a single dict.

    The TP/FP/TN/FN counts come first, followed by the entries produced by
    ``classification_metrics_from_counts``.
    """
    counts = confusion_matrix_binary_manual(y_true, y_pred, positive_label=positive_label)

    result = dict(counts)
    result.update(
        classification_metrics_from_counts(
            counts['TP'], counts['FP'], counts['TN'], counts['FN']
        )
    )
    return result


def confusion_matrix_multiclass_manual(y_true, y_pred, labels):
    """Build a confusion matrix with actual classes on rows, predictions on columns.

    ``labels`` fixes the row/column order. A value in ``y_true`` or ``y_pred``
    that is not listed in ``labels`` raises ``KeyError``.

    Returns an integer ``np.ndarray`` of shape ``(len(labels), len(labels))``.
    """
    labels = list(labels)
    position = dict(zip(labels, range(len(labels))))
    size = len(labels)

    matrix = np.zeros((size, size), dtype=int)

    for actual, predicted in zip(y_true, y_pred):
        row = position[actual]
        col = position[predicted]
        matrix[row, col] += 1

    return matrix


def per_class_metrics_from_confusion(cm):
    """Compute one-vs-rest precision, recall and F1 for every class.

    ``cm`` is indexed as ``cm[actual, predicted]`` and must expose
    ``.shape`` (e.g. a NumPy array).

    Returns a DataFrame with one row per class and the columns
    ``class_index``, ``support``, ``precision``, ``recall``, ``f1_score``.
    """
    records = []

    for cls in range(cm.shape[0]):
        true_pos = cm[cls, cls]
        predicted_as_cls = cm[:, cls].sum()   # column total
        actually_cls = cm[cls, :].sum()       # row total, i.e. the support

        false_pos = predicted_as_cls - true_pos
        false_neg = actually_cls - true_pos

        precision = safe_div(true_pos, true_pos + false_pos)
        recall = safe_div(true_pos, true_pos + false_neg)
        f1 = safe_div(2 * precision * recall, precision + recall)

        records.append(dict(
            class_index=cls,
            support=int(actually_cls),
            precision=precision,
            recall=recall,
            f1_score=f1,
        ))

    return pd.DataFrame(records)


def regression_metrics_manual(y_true, y_pred):
    """Compute MAE, MSE, RMSE and R² for a set of regression predictions.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Actual and predicted values; both must have the same shape.

    Returns
    -------
    dict
        Keys ``'MAE'``, ``'MSE'``, ``'RMSE'`` and ``'R2'``.

    Raises
    ------
    ValueError
        If the two inputs have different shapes (they would otherwise be
        silently broadcast against each other).

    Notes
    -----
    When ``y_true`` has zero variance R² is mathematically undefined. This
    function then returns 1.0 for perfect predictions and 0.0 otherwise,
    rather than reporting a perfect score for any prediction.
    """
    y_true = np.array(y_true, dtype=float)
    y_pred = np.array(y_pred, dtype=float)

    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )

    error = y_true - y_pred
    squared_error = error ** 2

    mae = np.mean(np.abs(error))
    mse = np.mean(squared_error)
    rmse = np.sqrt(mse)

    ss_res = np.sum(squared_error)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)

    if ss_tot != 0:
        r2 = 1 - ss_res / ss_tot
    else:
        # Constant target: avoid 0/0 and avoid scoring wrong predictions as perfect.
        r2 = 1.0 if ss_res == 0 else 0.0

    return {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R2': r2}


## 2. Binary classification evaluation (confusion matrix + metrics)

In [None]:
# Dataset: Breast Cancer (binary classification)
data_bc = load_breast_cancer()

# Keep only the first N_FEATURES_BC feature columns. The reduced feature set
# is intentional (the original note describes it as making the problem less
# trivial); raise the value to give the classifier more information.
N_FEATURES_BC = 3
X_bc = data_bc.data[:, :N_FEATURES_BC]
y_bc = data_bc.target

print('Dataset size:', len(y_bc))
print('Class names:', list(data_bc.target_names))

# Per-class sample count and share of the whole dataset (in percent).
class_counts = pd.Series(y_bc).value_counts().sort_index()
class_share = (class_counts / len(y_bc) * 100).round(2)
display(pd.DataFrame({
    'class_index': class_counts.index,
    'count': class_counts.values,
    'percentage': class_share.values
}))


In [None]:
# Stratified 75/25 split so both subsets keep the class proportions.
X_train_bc, X_test_bc, y_train_bc, y_test_bc = train_test_split_manual(
    X_bc,
    y_bc,
    test_size=0.25,
    seed=7,
    stratify=True,
)

# Fit KNN on the training part and predict the held-out part.
clf = KNeighborsClassifier(n_neighbors=7)
clf.fit(X_train_bc, y_train_bc)
y_pred_bc = clf.predict(X_test_bc)

# One-row table holding the confusion counts followed by the derived metrics.
summary = evaluate_binary_predictions(y_test_bc, y_pred_bc, positive_label=1)
summary_df = pd.DataFrame([summary]).round(4)
display(summary_df)


In [None]:
# Confusion counts for the positive class (label 1).
cm_counts = confusion_matrix_binary_manual(y_test_bc, y_pred_bc, positive_label=1)

# Rows are the actual class, columns the predicted class.
cm_table = pd.DataFrame(
    {
        'Pred Positive (1)': [cm_counts['TP'], cm_counts['FP']],
        'Pred Negative (0)': [cm_counts['FN'], cm_counts['TN']],
    },
    index=['Actual Positive (1)', 'Actual Negative (0)'],
)
display(cm_table)

# Heatmap of the same table, with the count written inside every cell.
fig, ax = plt.subplots(figsize=(5, 4))
im = ax.imshow(cm_table.values, cmap='Blues')
for i, row in enumerate(cm_table.values):
    for j, count in enumerate(row):
        ax.text(j, i, int(count), ha='center', va='center', color='black')
ax.set_xticks(range(cm_table.shape[1]))
ax.set_xticklabels(cm_table.columns, rotation=20)
ax.set_yticks(range(cm_table.shape[0]))
ax.set_yticklabels(cm_table.index)
ax.set_title('Binary confusion matrix (KNN)')
fig.colorbar(im, ax=ax)
fig.tight_layout()
plt.show()


### Exercise 1
Try changing `n_neighbors` (for example: 1, 3, 4, 5, 6, 7, 15).

Questions:
- Does accuracy always move in the same direction as F1?
- Which metric would you prioritize for a medical problem and why?

## 3. Accuracy can be misleading (imbalanced classes)

In [None]:
# Synthetic binary problem; `weights` requests roughly 95% class 0 and 5% class 1.
X_imb, y_imb = make_classification(
    n_samples=5000,
    n_features=8,
    n_informative=4,
    n_redundant=2,
    class_sep=0.5,
    weights=[0.95, 0.05],
    flip_y=0.01,
    random_state=4,
)

# Stratified 70/30 split keeps the imbalance in both subsets.
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split_manual(
    X_imb,
    y_imb,
    test_size=0.30,
    seed=11,
    stratify=True,
)

print('Class distribution in test set:')
test_share = pd.Series(y_test_i).value_counts(normalize=True)
display(test_share.sort_index().rename('share'))


In [None]:
# Baseline: label every test sample as class 0 (the majority class).
baseline_pred = np.zeros_like(y_test_i)
baseline_eval = evaluate_binary_predictions(y_test_i, baseline_pred, positive_label=1)

# KNN trained on the imbalanced training split.
knn_imb = KNeighborsClassifier(n_neighbors=7)
knn_imb.fit(X_train_i, y_train_i)
knn_pred = knn_imb.predict(X_test_i)
knn_eval = evaluate_binary_predictions(y_test_i, knn_pred, positive_label=1)

# Side-by-side table: one row per model, same columns for both.
comparison_rows = [
    {'model': 'Always predict 0', **baseline_eval},
    {'model': 'KNN (k=7)', **knn_eval},
]
comparison = pd.DataFrame(comparison_rows).round(4)
comparison


### Exercise 2
Change the class imbalance in `make_classification` (for example: `weights=[0.90, 0.10]` and `weights=[0.98, 0.02]`).

Questions:
- What happens to accuracy vs sensitivity (recall)?
- Why can high accuracy still be a bad model in imbalanced problems?

## 4. ROC curve and AUC (advanced)

In [None]:
def roc_points_manual(y_true, y_prob, thresholds):
    """Compute one (FPR, TPR) point per decision threshold.

    A sample is predicted positive (label 1) when its score in ``y_prob`` is
    greater than or equal to the threshold.

    Returns a DataFrame with the columns ``threshold``, ``TPR`` and ``FPR``,
    one row per entry of ``thresholds`` in the given order.
    """
    labels = np.array(y_true)
    scores = np.array(y_prob)

    def point_at(threshold):
        # Binarise the scores at this threshold, then count the outcomes.
        predicted = (scores >= threshold).astype(int)
        counts = confusion_matrix_binary_manual(labels, predicted, positive_label=1)
        return {
            'threshold': threshold,
            'TPR': safe_div(counts['TP'], counts['TP'] + counts['FN']),
            'FPR': safe_div(counts['FP'], counts['FP'] + counts['TN']),
        }

    return pd.DataFrame([point_at(t) for t in thresholds])


def auc_manual(fpr, tpr):
    """Area under a curve given as (FPR, TPR) points, by the trapezoid rule.

    Parameters
    ----------
    fpr, tpr : array-like of float
        x and y coordinates of the curve points, in any order.

    Returns
    -------
    float
        The trapezoid-rule area between the smallest and largest FPR supplied.

    Raises
    ------
    ValueError
        If ``fpr`` and ``tpr`` have different shapes.
    """
    fpr_arr = np.asarray(fpr, dtype=float)
    tpr_arr = np.asarray(tpr, dtype=float)
    if fpr_arr.shape != tpr_arr.shape:
        raise ValueError(
            f"fpr and tpr must have the same shape, got {fpr_arr.shape} and {tpr_arr.shape}"
        )

    # Sort by FPR and break ties by TPR. Sorting on FPR alone leaves points
    # that share an FPR in arbitrary order, so a vertical segment of the curve
    # can be walked top-down and the next trapezoid starts too low.
    order = np.lexsort((tpr_arr, fpr_arr))

    # np.trapezoid exists from NumPy 2.0; older releases only provide np.trapz.
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz
    return float(trapezoid(tpr_arr[order], fpr_arr[order]))


In [None]:
# Thresholds from high to low. The leading +inf classifies nothing as positive,
# so every curve starts at (FPR, TPR) = (0, 0). Starting at 1.0 is not enough,
# because a score of exactly 1.0 still satisfies `y_prob >= 1.0`; the curve
# would then begin to the right of the origin and the trapezoid AUC would miss
# the area before that first point.
thresholds = np.concatenate(([np.inf], np.linspace(1.0, 0.0, 51)))

roc_rows = []   # one summary row (k, AUC) per model
curves = {}     # k -> DataFrame of ROC points, reused by the plotting cell

for k in [3, 15]:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train_i, y_train_i)

    # Probability of positive class (class 1)
    y_prob = model.predict_proba(X_test_i)[:, 1]

    roc_df = roc_points_manual(y_test_i, y_prob, thresholds)
    auc_value = auc_manual(roc_df['FPR'], roc_df['TPR'])

    curves[k] = roc_df
    roc_rows.append({'k': k, 'AUC_manual': round(auc_value, 4)})

pd.DataFrame(roc_rows)


In [None]:
fig, ax = plt.subplots(figsize=(6, 5))

# One ROC curve per stored model, labelled with its trapezoid AUC.
for k, roc_df in curves.items():
    auc_value = auc_manual(roc_df['FPR'], roc_df['TPR'])
    curve_label = f'k={k} (AUC={auc_value:.3f})'
    ax.plot(roc_df['FPR'], roc_df['TPR'], marker='o', markersize=3, label=curve_label)

# Diagonal reference line, labelled as the random model.
ax.plot([0, 1], [0, 1], '--', color='gray', label='random model')

ax.set_xlabel('False Positive Rate (FPR)')
ax.set_ylabel('True Positive Rate (TPR)')
ax.set_title('Manual ROC curves (KNN)')
ax.legend()
ax.grid(alpha=0.3)
plt.show()


### Exercise 3
Try different `k` values and a different number of thresholds.

Questions:
- Which model has better AUC?
- Why do we need many thresholds for ROC/AUC?

## 5. Multiclass evaluation (Iris confusion matrix)

In [None]:
iris = load_iris()

# Only the first two feature columns are kept. The reduced input is deliberate
# (the original note describes it as making the problem less trivial).
X_iris = iris.data[:, :2]
y_iris = iris.target
label_names = iris.target_names

X_train_ir, X_test_ir, y_train_ir, y_test_ir = train_test_split_manual(
    X_iris,
    y_iris,
    test_size=0.25,
    seed=5,
    stratify=True,
)

knn_ir = KNeighborsClassifier(n_neighbors=5)
knn_ir.fit(X_train_ir, y_train_ir)
y_pred_ir = knn_ir.predict(X_test_ir)

# Row/column order of the confusion matrix; rows are actual, columns predicted.
labels = [0, 1, 2]
cm_multi = confusion_matrix_multiclass_manual(y_test_ir, y_pred_ir, labels=labels)

# Same matrix, labelled with the class names for display.
cm_df = pd.DataFrame(cm_multi, index=label_names, columns=label_names)
display(cm_df)


In [None]:
# Heatmap of the multiclass confusion matrix with the count in every cell.
fig, ax = plt.subplots(figsize=(5, 4))
im = ax.imshow(cm_df.values, cmap='Blues')
for i, row in enumerate(cm_df.values):
    for j, count in enumerate(row):
        ax.text(j, i, int(count), ha='center', va='center', color='black')
ax.set_xticks(range(cm_df.shape[1]))
ax.set_xticklabels(cm_df.columns)
ax.set_yticks(range(cm_df.shape[0]))
ax.set_yticklabels(cm_df.index)
ax.set_title('Multiclass confusion matrix (Iris, KNN)')
ax.set_xlabel('Predicted class')
ax.set_ylabel('Actual class')
fig.colorbar(im, ax=ax)
fig.tight_layout()
plt.show()

# Per-class precision / recall / F1, labelled with the class names.
per_class_df = per_class_metrics_from_confusion(cm_multi)
per_class_df['class_name'] = label_names
shown_columns = ['class_name', 'support', 'precision', 'recall', 'f1_score']
display(per_class_df[shown_columns].round(4))

# Macro F1 is the unweighted mean of the per-class F1 scores;
# overall accuracy is the diagonal of the matrix over its total.
macro_f1 = per_class_df['f1_score'].mean()
overall_accuracy = np.trace(cm_multi) / cm_multi.sum()
print('Overall accuracy:', round(overall_accuracy, 4))
print('Macro F1       :', round(macro_f1, 4))


### Exercise 4
Change `n_neighbors` and `seed` in the Iris section.

Questions:
- Which class is hardest to classify?
- Is overall accuracy enough to understand class-level performance?

## 6. Regression evaluation (MAE, MSE, RMSE, R²)

In [None]:
diabetes = load_diabetes()
X_reg, y_reg = diabetes.data, diabetes.target

# Plain random split: stratification is switched off for this continuous target.
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split_manual(
    X_reg,
    y_reg,
    test_size=0.25,
    seed=9,
    stratify=False,
)

# Fit one KNN regressor per k and record its test-set metrics.
rows = []
for k in [1, 3, 5, 11, 25, 51]:
    reg = KNeighborsRegressor(n_neighbors=k)
    reg.fit(X_train_r, y_train_r)

    pred = reg.predict(X_test_r)
    metrics = regression_metrics_manual(y_test_r, pred)

    rows.append({'k': k, **metrics})

# Lowest RMSE first, so the first row is the best model by that criterion.
reg_df = pd.DataFrame(rows).sort_values('RMSE')
reg_df.round(4)


In [None]:
# reg_df is sorted by RMSE, so its first row holds the best k.
best_k = int(reg_df['k'].iloc[0])
print('Best k by RMSE:', best_k)

# Refit with that k and score it again on the test split.
best_reg = KNeighborsRegressor(n_neighbors=best_k)
best_reg.fit(X_train_r, y_train_r)
best_pred = best_reg.predict(X_test_r)

best_metrics = regression_metrics_manual(y_test_r, best_pred)
print(pd.Series(best_metrics).round(4))

# Actual vs predicted; points on the red dashed diagonal are perfect predictions.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(y_test_r, best_pred, alpha=0.7)
ax.set_xlabel('Actual values')
ax.set_ylabel('Predicted values')
ax.set_title(f'Diabetes regression (KNN, k={best_k})')

min_v = min(y_test_r.min(), best_pred.min())
max_v = max(y_test_r.max(), best_pred.max())
ax.plot([min_v, max_v], [min_v, max_v], '--', color='red')
ax.grid(alpha=0.3)
plt.show()


### Exercise 5
- Try a different split (`seed` and `test_size`).
- Compare model choice when selecting by MAE vs RMSE vs R².

Questions:
- Which metric penalizes large errors more?
- Can two models have similar RMSE but different R²?

## 7. Conclusions
- Evaluation metrics help compare models objectively.
- Accuracy alone is often insufficient, especially with class imbalance.
- Confusion matrix + precision/recall/F1 gives better detail for classification.
- ROC/AUC summarizes behavior across multiple thresholds.
- For regression, choose MAE/MSE/RMSE/R² based on the type of error you care about.