In [None]:
# ============================================================
# 02_model_training.ipynb — Minimal Setup + Train/Evaluate Models
# Goal: Train baseline + tree model using a leakage-safe Pipeline
#       and evaluate with MAE, RMSE, R².
#       (baseline + RF, leakage check, error analysis)
# ============================================================
import warnings

# Silence two warning categories for the rest of the notebook:
#   RuntimeWarning -> originally aimed at numpy runtime warnings
#   UserWarning    -> originally aimed at sklearn imputer warnings
for _noisy_category in (RuntimeWarning, UserWarning):
    warnings.filterwarnings("ignore", category=_noisy_category)


import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GroupShuffleSplit
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

RANDOM_STATE = 42
TARGET = "range"

# ---- Load data ----
# Read the raw table and keep only the rows whose fuelType1 is "Electricity".
df = (
    pd.read_csv("../data/raw/vehicles.csv", low_memory=False)
    .loc[lambda raw: raw["fuelType1"] == "Electricity"]
    .copy()
)

# Guard: after the filter, "Electricity" must be the one and only fuel type.
fuel_types = df["fuelType1"].unique()
only_electric = len(fuel_types) == 1 and fuel_types[0] == "Electricity"
assert only_electric, (
    f"Sanity check failed: non-EV fuel types found: {fuel_types}")

# Guard: a model year earlier than 1997 is treated as a data problem.
earliest_year = df["year"].min()
assert earliest_year >= 1997, (
    f"Sanity check failed: suspicious EV year detected: {earliest_year}")

# Keep model years 2011 and later, then discard rows without a target value.
df = df[df["year"] >= 2011].copy().dropna(subset=[TARGET])
n_rows, n_cols = df.shape
print("Rows:", n_rows, "| Columns:", n_cols)

# ------------------------------------------------------------
# Sanity checks: EV-only dataset validation
# ------------------------------------------------------------
# from IPython.display import display
#print("EV year range:", df["year"].min(), "→", df["year"].max())
#print("Fuel types remaining:", df["fuelType1"].unique())
#print("\nEV counts by year (last 15 years shown):")
#display(df["year"].value_counts().sort_index().tail(15))
# --------------------------------------------------

# Separate the target vector from the feature matrix
y = df[TARGET]
X = df.drop(columns=[TARGET])

# Group by vehicle identity
GROUP_COLS = ["make", "model", "year"]  # if your column names differ, change here

# Fail fast when any grouping column is absent from the frame.
missing = [col for col in GROUP_COLS if col not in df.columns]
if missing:
    raise KeyError(f"Missing grouping columns: {missing}. Run df.columns and update GROUP_COLS.")

# One string key per row: the grouping columns joined with "_".
groups = df[GROUP_COLS].astype(str).agg("_".join, axis=1)

# A single group-aware split (test_size=0.2): every group key ends up on
# exactly one side, never both.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

# Distinct group keys present on each side of the split.
train_groups = set(groups.iloc[train_idx])
test_groups = set(groups.iloc[test_idx])
overlap = train_groups & test_groups

print("Train:", X_train.shape, "Test:", X_test.shape)
print("Unique groups - Train:", len(train_groups), "| Test:", len(test_groups))

# The split is only leakage-safe if no group key is shared between the sides.
print("Group overlap count:", len(overlap))
assert not overlap, "Leakage: same make/model/year appears in both train and test."

# ---- Column types ----
# Numeric features are the 64-bit int/float columns; categorical features are
# the text-like and boolean columns. "string" is included so that columns held
# in pandas' dedicated string dtype are selected too: anything not selected
# here is discarded by remainder="drop" below. This also keeps the selection
# consistent with the one used for the reduced (no-suspects) feature set.
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X_train.select_dtypes(include=["object", "category", "bool", "string"]).columns

# ---- Preprocessing ----
# Numeric branch: fill missing values with the column median.
numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))])

# Categorical branch: fill missing values with the literal "Unknown", then
# one-hot encode. handle_unknown="ignore" lets categories that were not seen
# during fit pass through at predict time instead of raising.
categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))])

# Route each column group through its branch; every other column is dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ],
    remainder="drop")

print("Numeric cols:", len(num_cols), "| Categorical cols:", len(cat_cols))
# ---- Eval helper ----
def eval_regression(name, y_true, y_pred):
    """Print and return MAE, RMSE and R^2 for one set of predictions.

    Args:
        name: Label printed above the metrics and stored under "model".
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        A dict with the keys "model", "mae", "rmse" and "r2".
    """
    scores = {
        "mae": mean_absolute_error(y_true, y_pred),
        # RMSE is the square root of the mean squared error.
        "rmse": np.sqrt(mean_squared_error(y_true, y_pred)),
        "r2": r2_score(y_true, y_pred),
    }

    print(name)
    print(f"  MAE : {scores['mae']:.2f}")
    print(f"  RMSE: {scores['rmse']:.2f}")
    print(f"  R^2 : {scores['r2']:.3f}")
    return {"model": name, **scores}

# ---- Models ----
# Local import: clone() is only needed by the training loop below.
from sklearn.base import clone

# Candidate estimators, keyed by the label used in the results table.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(
        n_estimators=300,
        random_state=RANDOM_STATE,
        n_jobs=-1
    )
}

# ---- Train + evaluate ----
results = []   # one metrics dict per model
pipes = {}     # fitted Pipeline per model label

for model_name, model in models.items():
    # Pipeline fits its steps in place, so passing the same `preprocess`
    # object to every pipeline would make them all share one transformer
    # (refitting any pipeline would silently change the others, including a
    # pipeline that is later saved to disk). clone() gives each pipeline its
    # own unfitted copy with identical parameters.
    pipe = Pipeline([
        ("preprocess", clone(preprocess)),
        ("model", model)
    ])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)

    pipes[model_name] = pipe   # e.g. pipes["Random Forest"]
    results.append(eval_regression(model_name, y_test, preds))

# Best (lowest RMSE) model first.
results_df = pd.DataFrame(results).sort_values("rmse")
results_df

# Feature importances of the fitted random forest, labelled with the column
# names produced by the preprocessing step (needs sklearn >= 1.0).
rf_pipeline = pipes["Random Forest"]
feature_names = rf_pipeline.named_steps["preprocess"].get_feature_names_out()
rf_model = rf_pipeline.named_steps["model"]
importances = rf_model.feature_importances_

# Most important features first.
imp_df = pd.DataFrame(
    {"feature": feature_names, "importance": importances}
).sort_values("importance", ascending=False)

imp_df.head(30)

# Leakage check: drop columns that look like they encode the target, then
# retrain the same two models on the reduced feature set.

# 1) Start with keyword-based suspects in ORIGINAL X columns
suspect_patterns = [
    "range", "rng", "mile", "miles",
    "mpge", "kwh", "kwh/100", "kwh_100",
    "electric", "charge", "battery", "capacity", "fuelE"
]

# Column names are lowercased before matching, so the patterns must be
# lowercased as well; otherwise a mixed-case pattern such as "fuelE" can
# never match anything.
lowered_patterns = [p.lower() for p in suspect_patterns]
name_suspects = [
    c for c in X.columns
    if any(p in c.lower() for p in lowered_patterns)
]

# 2) Add ultra-high-correlation numeric suspects (ORIGINAL X columns)
# NOTE(review): the correlations are computed over all rows, held-out rows
# included — consider screening on the training rows only.
num_cols_all = X.select_dtypes(include=["int64", "float64"]).columns
corrs = X[num_cols_all].corrwith(y).abs().sort_values(ascending=False)
corr_suspects = list(corrs[corrs > 0.98].index)

to_drop = sorted(set(name_suspects + corr_suspects))

print(f"Dropping {len(to_drop)} suspect columns:")
print(to_drop[:80], "..." if len(to_drop) > 80 else "")

# 3) Rebuild X/y with drops and rerun the exact same pipeline training
X2 = df.drop(columns=[TARGET]).drop(columns=to_drop, errors="ignore")
y2 = df[TARGET]

# Reuse the group-aware split indices instead of a fresh random split.
# A random split would place the same make/model/year on both sides, which is
# exactly the leakage the first split asserts against, and would also make
# these scores incomparable with the first run. X2/y2 come from the same `df`
# in the same row order as X/y, so the positional indices apply unchanged.
X2_train, X2_test = X2.iloc[train_idx], X2.iloc[test_idx]
y2_train, y2_test = y2.iloc[train_idx], y2.iloc[test_idx]

num_cols2 = X2_train.select_dtypes(include=["int64", "float64"]).columns
cat_cols2 = X2_train.select_dtypes(include=["object", "category", "bool", "string"]).columns

# Same preprocessing recipe as before, bound to the reduced column lists:
# median imputation for numerics; constant "Unknown" + one-hot for categoricals.
preprocess2 = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("imputer", SimpleImputer(strategy="median"))]), num_cols2),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ]), cat_cols2)
    ],
    remainder="drop"
)

models2 = {
    "Linear Regression (no suspects)": LinearRegression(),
    "Random Forest (no suspects)": RandomForestRegressor(
        n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1
    )
}

# Fit and score each model on the reduced feature set.
results2 = []
for name, model in models2.items():
    pipe = Pipeline([("preprocess", preprocess2), ("model", model)])
    pipe.fit(X2_train, y2_train)
    preds = pipe.predict(X2_test)
    results2.append(eval_regression(name, y2_test, preds))

pd.DataFrame(results2).sort_values("rmse")

# Model carried forward into the error analysis (label is fixed by hand here).
BEST_MODEL_NAME = "Random Forest (no suspects)"
best_estimator = models2[BEST_MODEL_NAME]

# Assemble preprocessing + estimator and fit on the training split
best_pipe = Pipeline(steps=[
    ("preprocess", preprocess2),
    ("model", best_estimator),
])
best_pipe.fit(X2_train, y2_train)

# Score the held-out rows
preds = best_pipe.predict(X2_test)

# One row per test vehicle: identity columns plus observed and predicted target
pred_df = X2_test[["make", "model", "year"]].assign(
    y_true=y2_test.values,
    y_pred=preds,
)


#-------------------------------------
#         Error Analysis
#------------------------------------
# Per-row absolute error; large values mark the rows the model gets most wrong.
abs_residual = pred_df["y_true"].sub(pred_df["y_pred"]).abs()
pred_df["abs_error"] = abs_residual

# The 20 rows with the largest absolute error
pred_df.sort_values("abs_error", ascending=False).head(20)

# Error summary per (make, year)
brand_year_summary = pred_df.groupby(["make", "year"]).agg(
    mean_abs_error=pd.NamedAgg(column="abs_error", aggfunc="mean"),
    median_abs_error=pd.NamedAgg(column="abs_error", aggfunc="median"),
    vehicle_count=pd.NamedAgg(column="abs_error", aggfunc="count"),
)

# Keep only groups backed by at least two rows, worst median error first.
brand_year_summary = (
    brand_year_summary
    .loc[brand_year_summary["vehicle_count"] >= 2]
    .sort_values("median_abs_error", ascending=False)
)

brand_year_summary.head(15)

# Finer breakdown per (make, model), including the span of years covered.
# Nothing is filtered out here; thin groups are labelled instead (see below).
model_aggregations = {
    "min_year": ("year", "min"),
    "max_year": ("year", "max"),
    "mean_abs_error": ("abs_error", "mean"),
    "median_abs_error": ("abs_error", "median"),
    "vehicle_count": ("abs_error", "count"),
}
model_error_summary = (
    pred_df
    .groupby(["make", "model"])
    .agg(**model_aggregations)
    .sort_values("median_abs_error", ascending=False)
)

# Label each group by whether it is backed by at least two rows.
has_support = model_error_summary["vehicle_count"] >= 2
model_error_summary["support_level"] = np.where(
    has_support, "reliable", "low_support"
)

model_error_summary.head(15)

In [None]:
from pathlib import Path

# Resolve project paths
PROJECT_ROOT = Path("..")  # notebooks/ → project root
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

# Save test-set prediction dataframe.
# Its identifying fields (make/model/year) are ordinary columns, so the row
# index carries no information and is left out.
pred_df.to_csv(
    DATA_PROCESSED / "pred_df_test.csv",
    index=False
)

# Save brand-year error summary.
# The group keys (make, year) live in the index after groupby/agg; writing
# with index=False alone would drop them and leave metrics that cannot be
# attributed to any group. reset_index() turns the keys into columns first.
brand_year_summary.reset_index().to_csv(
    DATA_PROCESSED / "brand_year_error_summary.csv",
    index=False
)

# Save model-level error summary (group keys make/model: same treatment).
model_error_summary.reset_index().to_csv(
    DATA_PROCESSED / "model_error_summary.csv",
    index=False
)

print("Saved processed artifacts to:", DATA_PROCESSED.resolve())

In [None]:
# ============================================
# SAVE TRAINED MODEL + TEST DATA FOR INFERENCE
# ============================================

# Imported here as well so this cell does not depend on an earlier cell
# having been run first.
from pathlib import Path

from joblib import dump

MODELS_DIR = Path("../models")
DATA_PROCESSED = Path("../data/processed")

MODELS_DIR.mkdir(parents=True, exist_ok=True)
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

# Save model.
# Persist the fitted Pipeline (preprocessing + estimator), not the bare
# estimator: the X_test.csv written below holds the raw, un-encoded columns,
# which only the full Pipeline can turn into the matrix the forest expects.
dump(pipes["Random Forest"], MODELS_DIR / "ev_range_model.joblib")

# Save test data (the raw features and target of the held-out split)
X_test.to_csv(DATA_PROCESSED / "X_test.csv", index=False)
y_test.to_csv(DATA_PROCESSED / "y_test.csv", index=False)

print("Model and test data saved successfully.")

In [None]:
# ============================================
# FINAL — Save full preprocessing + model Pipeline
# Goal: Persist the *entire* sklearn Pipeline (preprocess + model)
# so downstream inference never sees raw categorical strings.
# ============================================

import joblib
from pathlib import Path

# A bare "models/..." path is resolved against the current working directory.
# The other cells of this notebook address the project root as ".." (i.e. the
# notebook is run from notebooks/), so a bare path would create a stray
# notebooks/models directory. Resolve the project root explicitly instead:
# use the CWD if it contains data/, otherwise its parent.
PROJECT_ROOT = Path.cwd() if (Path.cwd() / "data").exists() else Path.cwd().parent
MODEL_PATH = PROJECT_ROOT / "models" / "ev_range_model.joblib"

# Ensure models directory exists
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

# Save the FULL pipeline (not the bare estimator)
joblib.dump(pipes["Random Forest"], MODEL_PATH)

# Report the path that was actually written.
print("✅ Saved full Pipeline to", MODEL_PATH)
print("Saved object type:", type(pipes["Random Forest"]))

In [None]:
# ============================================
# FINAL — Overwrite model artifact with FULL Pipeline (preprocess + model)
# ============================================

import joblib
from pathlib import Path
from sklearn.pipeline import Pipeline

# Anchor the artifact at <project root>/models rather than at a path relative
# to the current working directory: the other cells of this notebook address
# the project root as "..", so a bare "models/..." would end up under
# notebooks/. The project root is the CWD if it contains data/, else its parent.
PROJECT_ROOT = Path.cwd() if (Path.cwd() / "data").exists() else Path.cwd().parent
MODEL_PATH = PROJECT_ROOT / "models" / "ev_range_model.joblib"
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

# 1) Confirm we actually have the pipeline in memory
print("pipes keys:", list(pipes.keys()))
print("pipes['Random Forest'] type:", type(pipes["Random Forest"]))

# 2) Hard check: this MUST be a sklearn Pipeline
if not isinstance(pipes["Random Forest"], Pipeline):
    raise TypeError("pipes['Random Forest'] is not a sklearn Pipeline. Do NOT save.")

# 3) Overwrite the joblib with the full pipeline
joblib.dump(pipes["Random Forest"], MODEL_PATH)

# 4) Verify what is actually saved on disk by loading it back
reloaded = joblib.load(MODEL_PATH)
print("✅ Saved model artifact type on disk:", type(reloaded))
print("✅ Saved steps:", reloaded.named_steps.keys())

In [None]:
# ============================================
# FINAL — Save FULL Pipeline to project-root /models
# ============================================

import joblib
from pathlib import Path
from sklearn.pipeline import Pipeline

# Project root = the working directory when it contains data/, otherwise the
# directory above it (the case when the notebook runs from /notebooks).
working_dir = Path.cwd()
PROJECT_ROOT = working_dir if (working_dir / "data").exists() else working_dir.parent
MODEL_PATH = PROJECT_ROOT / "models" / "ev_range_model.joblib"
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

print("CWD:", Path.cwd())
print("PROJECT_ROOT:", PROJECT_ROOT)
print("Saving to:", MODEL_PATH)

# Refuse to write anything that is not a full sklearn Pipeline.
rf_pipeline = pipes["Random Forest"]
if not isinstance(rf_pipeline, Pipeline):
    raise TypeError("pipes['Random Forest'] is not a sklearn Pipeline — do not save.")

joblib.dump(rf_pipeline, MODEL_PATH)

# Load the artifact back to confirm what was actually written.
reloaded = joblib.load(MODEL_PATH)
print("✅ On-disk type:", type(reloaded))