# 03 - Classification (10-fold CV + F1)

This notebook runs a 10-fold stratified cross-validation with GridSearch for three classifiers (KNN, RandomForest, SVM) and compares macro F1.

Inputs: `etl/datasets/movie_features.csv`

Outputs: `etl/reports/model_comparison.csv`

Run the cells sequentially. The implementation mirrors `notebooks/03_classification.py` but is split into runnable notebook cells.

In [1]:
# Imports
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score, make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import json
import warnings

warnings.filterwarnings("ignore")

print('Imports OK')

Imports OK


In [2]:
# Helper functions: load + prepare

def load_data(path: Path):
    if not path.exists():
        raise FileNotFoundError(f"Dataset not found: {path}\nRun `python scripts/fetch_data.py` and `python run_etl.py` (or the individual transforms) to create it.")
    df = pd.read_csv(path)
    return df


def prepare_features(df: pd.DataFrame):
    # Features observed in movie_features.csv: avg_rating, rating_count, rating_std, year, tmdbId
    features = ["avg_rating", "rating_count", "rating_std", "year", "tmdbId"]
    missing = [c for c in features if c not in df.columns]
    if missing:
        raise ValueError(f"Missing expected feature columns: {missing}")

    # Label
    if "label_genre" not in df.columns:
        raise ValueError("Missing label column 'label_genre' in dataset")

    X = df[features].copy()
    y = df["label_genre"].copy()

    # Drop rows with missing label
    mask = y.notna()
    X = X[mask]
    y = y[mask]

    # For simplicity drop rows with any missing numeric feature
    X = X.dropna()
    y = y.loc[X.index]

    # Convert types
    X["rating_count"] = X["rating_count"].astype(float)
    X["year"] = X["year"].astype(float)
    X["tmdbId"] = pd.to_numeric(X["tmdbId"], errors="coerce")
    X = X.fillna(0)

    # Encode labels
    le = LabelEncoder()
    y_enc = le.fit_transform(y)

    return X.values, y_enc, le, X.columns.tolist()

print('Helper functions defined')

Helper functions defined


In [3]:
# Model selection function

def run_model_selection(X, y, feature_names, out_report: Path):
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    scorer = make_scorer(f1_score, average="macro")

    candidates = [
        (
            "KNN",
            KNeighborsClassifier(),
            {"n_neighbors": [3, 5, 7], "weights": ["uniform", "distance"]},
        ),
        (
            "RandomForest",
            RandomForestClassifier(random_state=42),
            {"n_estimators": [100, 200], "max_depth": [None, 10, 20]},
        ),
        (
            "SVM",
            SVC(random_state=42),
            {"C": [0.1, 1.0, 10.0], "kernel": ["rbf", "linear"]},
        ),
    ]

    results = []

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    for name, model, grid in candidates:
        print(f"\n>>> Tuning {name} ...")
        gs = GridSearchCV(
            estimator=model,
            param_grid=grid,
            scoring=scorer,
            cv=cv,
            n_jobs=-1,
            refit=True,
            verbose=0,
        )
        gs.fit(X_scaled, y)

        best = gs.best_estimator_
        best_params = gs.best_params_
        # Re-run cross_val_score on the best estimator to get mean/std
        scores = cross_val_score(best, X_scaled, y, cv=cv, scoring=scorer, n_jobs=-1)

        row = {
            "model": name,
            "mean_cv_f1_macro": float(scores.mean()),
            "std_cv_f1_macro": float(scores.std()),
            "best_params": json.dumps(best_params, ensure_ascii=False),
            "n_features": len(feature_names),
            "features": ",".join(feature_names),
        }
        results.append(row)

        print(f"{name} best params: {best_params}")
        print(f"{name} CV f1_macro: {scores.mean():.4f} ± {scores.std():.4f}")

    df_res = pd.DataFrame(results).sort_values("mean_cv_f1_macro", ascending=False)
    out_report.parent.mkdir(parents=True, exist_ok=True)
    df_res.to_csv(out_report, index=False, encoding="utf-8")
    print(f"\nSaved model comparison report -> {out_report}")
    return df_res

print('Model selection function defined')

Model selection function defined


In [5]:
# Run pipeline (execute this cell)
from IPython.display import display

# Resolve dataset path robustly (works when kernel cwd is repo root or notebooks/)
project_root = Path.cwd().resolve()
possible_paths = [
    project_root / "etl" / "datasets" / "movie_features.csv",
    project_root.parent / "etl" / "datasets" / "movie_features.csv",
]

data_path = None
for p in possible_paths:
    if p.exists():
        data_path = p
        break

if data_path is None:
    raise FileNotFoundError(
        f"Dataset not found in expected locations: {possible_paths}\nPlease run `python scripts/fetch_data.py` and the ETL transforms to produce `etl/intermediate/*.parquet` and then `etl/datasets/movie_features.csv`"
    )

out_report = project_root / "etl" / "reports" / "model_comparison.csv"

print("Loading dataset:", data_path)
df = load_data(data_path)
print(f"Dataset shape: {df.shape}")

X, y, le, feature_names = prepare_features(df)
print(f"Prepared X shape: {X.shape}, y shape: {y.shape}, n_classes: {len(le.classes_)}")

results = run_model_selection(X, y, feature_names, out_report)

print('\nResults DataFrame:')
display(results)

print('\nSaved report at', out_report)


Loading dataset: C:\Users\Admin\Desktop\PJ_KTDL\MovieRecProject_N5\etl\datasets\movie_features.csv
Dataset shape: (9675, 8)
Prepared X shape: (9668, 5), y shape: (9668,), n_classes: 18

>>> Tuning KNN ...
KNN best params: {'n_neighbors': 3, 'weights': 'distance'}
KNN CV f1_macro: 0.0912 ± 0.0098

>>> Tuning RandomForest ...
RandomForest best params: {'max_depth': 20, 'n_estimators': 100}
RandomForest CV f1_macro: 0.0968 ± 0.0062

>>> Tuning SVM ...
SVM best params: {'C': 10.0, 'kernel': 'rbf'}
SVM CV f1_macro: 0.0741 ± 0.0051

Saved model comparison report -> C:\Users\Admin\Desktop\PJ_KTDL\MovieRecProject_N5\notebooks\etl\reports\model_comparison.csv

Results DataFrame:


Unnamed: 0,model,mean_cv_f1_macro,std_cv_f1_macro,best_params,n_features,features
1,RandomForest,0.096845,0.006159,"{""max_depth"": 20, ""n_estimators"": 100}",5,"avg_rating,rating_count,rating_std,year,tmdbId"
0,KNN,0.091197,0.009825,"{""n_neighbors"": 3, ""weights"": ""distance""}",5,"avg_rating,rating_count,rating_std,year,tmdbId"
2,SVM,0.074095,0.005108,"{""C"": 10.0, ""kernel"": ""rbf""}",5,"avg_rating,rating_count,rating_std,year,tmdbId"



Saved report at C:\Users\Admin\Desktop\PJ_KTDL\MovieRecProject_N5\notebooks\etl\reports\model_comparison.csv


In [6]:
# Ensure results are also saved into repo root etl/reports (not inside notebooks/etl/...)
if 'results' in globals():
    repo_root = data_path.parents[2]
    correct_out = repo_root / 'etl' / 'reports' / 'model_comparison.csv'
    correct_out.parent.mkdir(parents=True, exist_ok=True)
    results.to_csv(correct_out, index=False, encoding='utf-8')
    print('Saved corrected report at', correct_out)
else:
    print('Variable `results` not found in notebook state; run the pipeline cell first.')

Saved corrected report at C:\Users\Admin\Desktop\PJ_KTDL\MovieRecProject_N5\etl\reports\model_comparison.csv
