In [4]:
# Run configuration carried over from the Stage 1/2 notebooks.
# Dataset path, target, and task type follow What_should_we_keep.ipynb.

import os

# Dataset and learning task
data_path = 'data/US_Accidents_March23.csv'
target_column, task_type, primary_metric = 'Severity', 'classification', 'f1'

# Stage 2 artifacts: switches first, then the files they refer to
use_selected_doc = True
use_removed_columns_file = True
selected_doc_path = os.path.join('outputs', 'selected_features_doc.csv')
removed_cols_path = os.path.join('outputs', 'removed_columns_stage2.json')


In [7]:
# Optional: install required packages that are missing from this kernel's environment.
# Every package in `pkgs` is probed individually, so a missing plotting/serialisation
# dependency (matplotlib, seaborn, joblib) is installed too, and packages that are
# already importable are left untouched.
import importlib.util
import sys, subprocess
pkgs = ["pandas", "numpy", "scikit-learn", "matplotlib", "seaborn", "joblib"]
# pip distribution name -> importable module name, for the cases where they differ
_import_names = {"scikit-learn": "sklearn"}
missing_pkgs = [p for p in pkgs if importlib.util.find_spec(_import_names.get(p, p)) is None]
for p in missing_pkgs:
    subprocess.check_call([sys.executable, "-m", "pip", "install", p])
if missing_pkgs:
    # Let the import system notice modules installed while the kernel is running.
    importlib.invalidate_caches()


# Imports and configuration
import warnings
warnings.filterwarnings("ignore")


import json
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_validate, GridSearchCV
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix,
    mean_absolute_error, mean_squared_error, r2_score, make_scorer
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer


# Models
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor


# xgboost is optional: record whether the import worked instead of failing the cell.
try:
    from xgboost import XGBClassifier, XGBRegressor  # optional
except Exception:
    XGB_AVAILABLE = False
else:
    XGB_AVAILABLE = True


# One seed for NumPy's global RNG; the same constant is passed as random_state elsewhere.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)


# Metric helpers
def compute_classification_metrics(y_true, y_pred, y_proba=None):
    """Summarise classification quality for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    y_proba : array-like or None, optional
        Class probabilities. When given, ROC AUC is attempted: column 1 is
        used if ``y_true`` holds exactly two distinct labels, otherwise the
        full matrix is scored one-vs-rest.

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision``, ``recall``, ``f1`` (weighted
        averages), ``roc_auc`` (``None`` when not computed or when its
        computation raised) and ``confusion_matrix``.
    """
    acc = accuracy_score(y_true, y_pred)
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )

    roc = None
    if y_proba is not None:
        # Best effort: any failure while scoring probabilities leaves roc_auc as None.
        try:
            is_binary = len(np.unique(y_true)) == 2
            roc = (
                roc_auc_score(y_true, y_proba[:, 1])
                if is_binary
                else roc_auc_score(y_true, y_proba, multi_class="ovr")
            )
        except Exception:
            pass

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc,
        "confusion_matrix": confusion_matrix(y_true, y_pred),
    }


def compute_regression_metrics(y_true, y_pred):
    """Return MAE, MSE, RMSE and R2 for a set of regression predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    dict
        Keys ``mae``, ``mse``, ``rmse`` (square root of ``mse``) and ``r2``.
    """
    abs_err = mean_absolute_error(y_true, y_pred)
    sq_err = mean_squared_error(y_true, y_pred)
    return {
        "mae": abs_err,
        "mse": sq_err,
        "rmse": np.sqrt(sq_err),
        "r2": r2_score(y_true, y_pred),
    }


def _weighted_f1(y_true, y_pred):
    """Weighted-average F1, taken from precision_recall_fscore_support."""
    _precision, _recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )
    return f1


def _rmse(y_true, y_pred):
    """Root of the mean squared error."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Scorers keyed by metric name; RMSE is registered as lower-is-better.
classification_scorers = dict(
    accuracy=make_scorer(accuracy_score),
    f1=make_scorer(_weighted_f1),
)
regression_scorers = dict(
    rmse=make_scorer(_rmse, greater_is_better=False),
    r2=make_scorer(r2_score),
)


# Custom metrics
from sklearn.metrics import fbeta_score


def fbeta_weighted(y_true, y_pred, beta=2):
    """Weighted-average F-beta score of `y_pred` against `y_true` (beta defaults to 2)."""
    return fbeta_score(y_true, y_pred, average="weighted", beta=beta)


# Scorer wrapper around fbeta_weighted with beta pinned to 2.
custom_classification_scorer = make_scorer(fbeta_weighted, beta=2)


def smape(y_true, y_pred):
    """Mean of 2*|pred - true| / (|true| + |pred|) over all elements.

    Positions where both values are zero would divide by zero; their
    denominator is replaced by 1, so they contribute 0 to the mean.
    The result is a fraction (not multiplied by 100).
    """
    actual = np.array(y_true)
    forecast = np.array(y_pred)
    scale = np.abs(actual) + np.abs(forecast)
    safe_scale = np.where(scale == 0, 1, scale)
    per_sample = 2.0 * np.abs(forecast - actual) / safe_scale
    return np.mean(per_sample)
# Wrap smape as a scorer; greater_is_better=False marks lower values as better.
custom_regression_scorer = make_scorer(smape, greater_is_better=False)

# Phase 4: Model Selection and Evaluation

This notebook implements training, evaluation, custom metrics, cross-validation, hyperparameter optimization, and model comparison across multiple model families. Configure the task type, target column, and dataset path in the configuration cell at the top of the notebook. Stage 2 selections are applied automatically: `outputs/selected_features_doc.csv` is used when it exists and `use_selected_doc` is enabled; otherwise the columns listed in `outputs/removed_columns_stage2.json` are dropped.


In [5]:
# Helper: Apply Stage 2 selection using documentation CSV or removed-columns JSON

import pandas as pd
import json
import os

def apply_stage2_selection(df: pd.DataFrame, target: str,
                           selected_doc_csv: str = selected_doc_path,
                           removed_json: str = removed_cols_path,
                           prefer_doc: bool = use_selected_doc):
    """Restrict `df` to the columns chosen in Stage 2; the target is always kept.

    Two artifacts are consulted, in this order:

    1. ``selected_doc_csv`` (only when ``prefer_doc`` is true and the file
       exists): a CSV with ``Feature`` and ``Kept?`` columns. Rows whose
       ``Kept?`` value is "yes" (case-insensitive, surrounding whitespace
       ignored) name the columns to keep.
    2. ``removed_json`` (when it exists): a JSON object whose values are lists
       of column names to drop.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    target : str
        Target column name. It is added to the kept list in branch 1 and is
        never dropped in branch 2.
    selected_doc_csv, removed_json : str
        Paths to the two Stage 2 artifacts.
    prefer_doc : bool
        Whether the documentation CSV takes precedence over the JSON file.

    Returns
    -------
    pd.DataFrame
        A new frame with the selection applied, or a copy of `df` when
        neither artifact applies.
    """
    if prefer_doc and os.path.exists(selected_doc_csv):
        doc = pd.read_csv(selected_doc_csv)
        # astype(str) + strip tolerate padded cells and non-string dtypes.
        is_kept = doc['Kept?'].astype(str).str.strip().str.lower() == 'yes'
        kept = doc.loc[is_kept, 'Feature'].tolist()
        # Ensure target present
        if target not in kept:
            kept.append(target)
        # dict.fromkeys de-duplicates while preserving order, so a feature
        # listed twice does not yield duplicate columns.
        kept = [c for c in dict.fromkeys(kept) if c in df.columns]
        print(f"Using selected_features_doc.csv; keeping {len(kept)} columns.")
        return df[kept]
    # Fallback to removed-columns file
    if os.path.exists(removed_json):
        with open(removed_json, 'r') as f:
            removed_sets = json.load(f)
        # A set gives an accurate count when a column appears in several groups;
        # the target is excluded so this branch can never remove it.
        drop_cols = {
            c
            for cols in removed_sets.values()
            for c in cols
            if c in df.columns and c != target
        }
        if drop_cols:
            print(f"Dropping {len(drop_cols)} columns from removed_columns_stage2.json")
            return df.drop(columns=list(drop_cols))
    # Nothing to apply: still hand back a separate object, as the other branches do.
    return df.copy()


In [8]:
# Load the dataset, apply the Stage 2 selection, then split into train/test
assert os.path.exists(data_path), f"Data file not found: {data_path}. Please set data_path."

df = apply_stage2_selection(pd.read_csv(data_path), target_column)

assert target_column in df.columns, f"Target column '{target_column}' not in data."
y = df[target_column]
X = df.drop(columns=[target_column])

# Column types are derived after selection so they match the columns actually kept
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(exclude=[np.number]).columns.tolist()
print(f"Numeric features: {len(numeric_features)} | Categorical features: {len(categorical_features)}")

# 80/20 split; stratified on the target only for classification
split_kwargs = {"test_size": 0.2, "random_state": RANDOM_STATE}
if task_type == "classification":
    split_kwargs["stratify"] = y
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)


Dropping 14 columns from removed_columns_stage2.json
Numeric features: 8 | Categorical features: 23
Numeric features: 8 | Categorical features: 23


In [9]:
# Preprocessing: per-dtype pipelines combined in one ColumnTransformer

# Numeric columns: median imputation, then standardisation
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: most-frequent imputation, then one-hot encoding
# (categories unseen during fit are ignored at transform time)
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

In [None]:
# Baseline and core model families
#
# Every candidate is the shared `preprocessor` followed by one estimator. All of
# them go through the same fit -> predict -> score helper, so every row of
# `results` has the same keys and real timings (baseline and GaussianNB included).
from sklearn.preprocessing import FunctionTransformer  # only needed by this cell

results = []


def _to_dense(features):
    """Return `features` as a dense array (scipy sparse input is converted via .toarray())."""
    return features.toarray() if hasattr(features, "toarray") else features


def _build_pipeline(step_name, estimator, densify=False):
    """Chain the shared preprocessor with `estimator`, optionally densifying in between."""
    steps = [("preprocessor", preprocessor)]
    if densify:
        steps.append(("to_dense", FunctionTransformer(_to_dense, accept_sparse=True)))
    steps.append((step_name, estimator))
    return Pipeline(steps=steps)


def _fit_and_evaluate(label, pipeline, metric_fn, with_proba=False):
    """Fit on the training split, score on the test split, and return one leaderboard row."""
    start = time.perf_counter()
    pipeline.fit(X_train, y_train)
    fit_t = time.perf_counter() - start

    start = time.perf_counter()
    y_pred = pipeline.predict(X_test)
    pred_t = time.perf_counter() - start

    if with_proba and hasattr(pipeline, "predict_proba"):
        # Probabilities let the metric helper fill in roc_auc; kept out of predict_time.
        metrics = metric_fn(y_test, y_pred, pipeline.predict_proba(X_test))
    else:
        metrics = metric_fn(y_test, y_pred)
    return {"model": label, **metrics, "fit_time": fit_t, "predict_time": pred_t}


if task_type == "classification":
    baseline = _build_pipeline("clf", DummyClassifier(strategy="most_frequent", random_state=RANDOM_STATE))
    dt = _build_pipeline("clf", DecisionTreeClassifier(random_state=RANDOM_STATE))
    knn = _build_pipeline("clf", KNeighborsClassifier())
    # GaussianNB rejects sparse input, and the one-hot branch can make the
    # ColumnTransformer output sparse, so this pipeline densifies first.
    # NOTE: densifying a very wide one-hot matrix can be memory-hungry.
    nb = _build_pipeline("clf", GaussianNB(), densify=True)
    logr = _build_pipeline("clf", LogisticRegression(max_iter=1000, random_state=RANDOM_STATE))
    svm_model = _build_pipeline("clf", SVC(probability=True, random_state=RANDOM_STATE))
    rf = _build_pipeline("clf", RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE))

    candidates = [
        ("Baseline(DummyClassifier)", baseline),
        ("DecisionTreeClassifier", dt),
        ("KNeighborsClassifier", knn),
        ("GaussianNB", nb),
        ("LogisticRegression", logr),
        ("SVC", svm_model),
        ("RandomForestClassifier", rf),
    ]
    metric_fn, with_proba = compute_classification_metrics, True
else:
    baseline = _build_pipeline("reg", DummyRegressor(strategy="mean"))
    dt = _build_pipeline("reg", DecisionTreeRegressor(random_state=RANDOM_STATE))
    knn = _build_pipeline("reg", KNeighborsRegressor())
    lr = _build_pipeline("reg", LinearRegression())
    svm_model = _build_pipeline("reg", SVR())
    rf = _build_pipeline("reg", RandomForestRegressor(n_estimators=200, random_state=RANDOM_STATE))

    candidates = [
        ("Baseline(DummyRegressor)", baseline),
        ("DecisionTreeRegressor", dt),
        ("KNeighborsRegressor", knn),
        ("LinearRegression", lr),
        ("SVR", svm_model),
        ("RandomForestRegressor", rf),
    ]
    metric_fn, with_proba = compute_regression_metrics, False


# Rows are appended one at a time so results gathered before a failure are kept.
for label, pipeline in candidates:
    results.append(_fit_and_evaluate(label, pipeline, metric_fn, with_proba))

In [None]:
# Cross-Validation and Grid Search
#
# For the active task type: choose the CV splitter, the estimators to tune,
# their parameter grids (keys are prefixed with the pipeline step name) and
# the scorer. Each estimator is then wrapped with the shared preprocessor and
# tuned with a 5-fold grid search.
if task_type == "classification":
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    step_name = "clf"
    estimators = {
        "DecisionTreeClassifier": DecisionTreeClassifier(random_state=RANDOM_STATE),
        "KNeighborsClassifier": KNeighborsClassifier(),
        "LogisticRegression": LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
        "SVC": SVC(probability=True, random_state=RANDOM_STATE),
        "RandomForestClassifier": RandomForestClassifier(random_state=RANDOM_STATE),
    }
    param_grids = {
        "DecisionTreeClassifier": {
            "clf__max_depth": [None, 5, 10],
            "clf__min_samples_split": [2, 5, 10],
        },
        "KNeighborsClassifier": {
            "clf__n_neighbors": [5, 11, 21],
            "clf__weights": ["uniform", "distance"],
        },
        "LogisticRegression": {
            "clf__C": [0.1, 1.0, 10.0],
            "clf__penalty": ["l2"],
            "clf__solver": ["lbfgs", "saga"],
        },
        "SVC": {
            "clf__kernel": ["rbf", "linear"],
            "clf__C": [0.5, 1, 2],
            "clf__gamma": ["scale", "auto"],
        },
        "RandomForestClassifier": {
            "clf__n_estimators": [100, 200],
            "clf__max_depth": [None, 10, 20],
            "clf__max_features": ["sqrt", "log2"],
        },
    }
    scoring = classification_scorers["f1"]
else:
    cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    step_name = "reg"
    estimators = {
        "DecisionTreeRegressor": DecisionTreeRegressor(random_state=RANDOM_STATE),
        "KNeighborsRegressor": KNeighborsRegressor(),
        "SVR": SVR(),
        "RandomForestRegressor": RandomForestRegressor(random_state=RANDOM_STATE),
        "LinearRegression": LinearRegression(),
    }
    param_grids = {
        "DecisionTreeRegressor": {
            "reg__max_depth": [None, 5, 10],
            "reg__min_samples_split": [2, 5, 10],
        },
        "KNeighborsRegressor": {
            "reg__n_neighbors": [5, 11, 21],
            "reg__weights": ["uniform", "distance"],
        },
        "SVR": {
            "reg__kernel": ["rbf", "linear"],
            "reg__C": [0.5, 1, 2],
            "reg__gamma": ["scale", "auto"],
        },
        "RandomForestRegressor": {
            "reg__n_estimators": [100, 200],
            "reg__max_depth": [None, 10, 20],
            "reg__max_features": ["sqrt", "log2"],
        },
        "LinearRegression": {},
    }
    scoring = regression_scorers["rmse"]

# Same insertion order as `estimators`, each wrapped with the shared preprocessor.
base_models = {
    model_name: Pipeline([("preprocessor", preprocessor), (step_name, estimator)])
    for model_name, estimator in estimators.items()
}


best_models = []
for name, model in base_models.items():
    gs = GridSearchCV(model, param_grids.get(name, {}), cv=cv, scoring=scoring, n_jobs=-1)
    gs.fit(X_train, y_train)
    best_models.append(
        {
            "name": name,
            "best_estimator": gs.best_estimator_,
            "best_params": gs.best_params_,
            "best_score_cv": gs.best_score_,
        }
    )


# Compact view of the tuning results (the estimator objects are left out).
best_models_df = pd.DataFrame(
    [
        {"model": bm["name"], "best_score_cv": bm["best_score_cv"], "best_params": bm["best_params"]}
        for bm in best_models
    ]
)
best_models_df.head()

In [None]:
# Leaderboard, plots, and save best estimator
comp_df = pd.DataFrame(results)

# Metric column that drives the plot and the ranking, and its direction.
if task_type == "classification":
    metric_col, metric_label, higher_is_better = "f1", "F1 Score", True
else:
    metric_col, metric_label, higher_is_better = "rmse", "RMSE (lower is better)", False


plt.figure(figsize=(10,5))
sns.barplot(data=comp_df, x="model", y=metric_col)
plt.ylabel(metric_label)
plt.xticks(rotation=45, ha="right")
plt.title("Primary Metric by Model")
plt.tight_layout()


# Rank and justification
ranked = comp_df.sort_values(by=metric_col, ascending=not higher_is_better)
best_row = ranked.iloc[0]
if task_type == "classification":
    justification = (
        f"Selected {best_row['model']} for highest F1 ({best_row['f1']:.3f}). "
        f"Compared to baseline accuracy {comp_df.loc[comp_df['model']=='Baseline(DummyClassifier)','accuracy'].max():.3f} where available. "
        f"Balance of performance, interpretability, and efficiency.")
else:
    justification = (
        f"Selected {best_row['model']} for lowest RMSE ({best_row['rmse']:.3f}) and strong R2 ({best_row.get('r2', np.nan):.3f}). "
        f"Improves over baseline.")


print(justification)


import os
from joblib import dump
os.makedirs("outputs", exist_ok=True)
comp_df.to_csv(os.path.join("outputs", "model_leaderboard.csv"), index=False)
ranked.to_csv(os.path.join("outputs", "model_leaderboard_ranked.csv"), index=False)


# Save best estimator from grid search
best_from_grid = None
if len(best_models) > 0:
    # best_score_cv comes from GridSearchCV, where a larger score is always better:
    # the RMSE scorer is built with greater_is_better=False, so its scores are
    # sign-flipped (negative RMSE). max() is therefore right for both task types;
    # min() would pick the worst regression model.
    best_from_grid = max(best_models, key=lambda bm: bm["best_score_cv"])
    # best_estimator_ was already refit on the full training split by GridSearchCV
    # (refit=True is its default), so it is saved as-is instead of being fit again.
    best_est = best_from_grid["best_estimator"]
    dump(best_est, os.path.join("outputs", "best_model.pkl"))
    print("Saved best tuned model:", best_from_grid["name"], best_from_grid["best_params"])


# Save metrics summary: configuration plus the headline numbers from this run
summary = {
    "task_type": task_type,
    "primary_metric": primary_metric,
    "leaderboard_metric": metric_col,
    "leaderboard_best_model": str(best_row["model"]),
    "leaderboard_best_score": float(best_row[metric_col]),
}
if best_from_grid is not None:
    summary["tuned_best_model"] = best_from_grid["name"]
    # For regression this is the sign-flipped (negative) RMSE reported by GridSearchCV.
    summary["tuned_best_cv_score"] = float(best_from_grid["best_score_cv"])
    summary["tuned_best_params"] = best_from_grid["best_params"]
with open(os.path.join("outputs", "metrics_summary.json"), "w") as f:
    # default=str keeps the dump from failing on any non-JSON-native parameter value.
    json.dump(summary, f, indent=2, default=str)