# Modeling
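Train several regressors (linear regression, random forest, gradient boosting), compare them by R², and save the best model together with its column metadata.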

In [None]:

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import joblib, json

# Load the processed dataset from disk.
DF_PATH = Path("../data/processed/processed.csv")
df = pd.read_csv(DF_PATH)

# Separate the regression target from the feature matrix.
target_col = "production"
X = df.drop(columns=[target_col])
y = df[target_col]

# Every feature has to be assigned to exactly one transformer below:
# ColumnTransformer drops unlisted columns by default (remainder="drop").
# Selecting only `object` columns as categorical would silently discard
# features of any other non-numeric dtype (bool, category, string, ...), so
# everything that is not numeric is treated as categorical instead.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

# One-hot encode categoricals (categories unseen during fit are encoded as
# all zeros at predict time) and standardise the numeric features.
pre = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ("num", StandardScaler(), num_cols),
])

# Hyper-parameter grids. Keys carry the "model__" prefix because the
# estimator is installed as the "model" step of a Pipeline.
rf_grid = {
    "model__n_estimators": [200],
    "model__max_depth": [None, 20],
}
gbr_grid = {
    "model__n_estimators": [150],
    "model__learning_rate": [0.05, 0.1],
    "model__max_depth": [2, 3],
}

# Candidate regressors: name -> (estimator, parameter grid).
# An empty grid means no hyper-parameters are searched for that estimator.
models = {
    "linreg": (LinearRegression(), {}),
    "rf": (RandomForestRegressor(random_state=42), rf_grid),
    "gbr": (GradientBoostingRegressor(random_state=42), gbr_grid),
}

# Hold out 20% of the rows. The test split is used for reporting only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The winner is chosen by cross-validated R2 on the training split. Ranking
# the candidates by their test-set R2 would make the held-out data part of
# model selection and turn the reported score into an optimistic estimate.
best_model, best_score, best_name = None, -np.inf, ""
for name, (est, grid) in models.items():
    pipe = Pipeline([("pre", pre), ("model", est)])

    # An empty grid yields a single candidate, so estimators without tunable
    # parameters are cross-validated exactly like the others. GridSearchCV
    # works on clones of `pipe`, so the shared `pre` object is never fitted
    # in place; `best_estimator_` is refit on the whole training split.
    gs = GridSearchCV(pipe, grid, scoring="r2", cv=3, n_jobs=-1)
    gs.fit(X_train, y_train)
    model = gs.best_estimator_
    cv_r2 = gs.best_score_

    # Test-set R2 is printed for information and does not drive selection.
    preds = model.predict(X_test)
    r2 = r2_score(y_test, preds)
    print(name, "CV R2:", cv_r2)
    print(name, "R2:", r2)

    if cv_r2 > best_score:
        best_model, best_score, best_name = model, cv_r2, name

print("selected:", best_name, "CV R2:", best_score)

# Make sure the output directory exists before writing any artifact.
out_dir = Path("../models")
out_dir.mkdir(parents=True, exist_ok=True)

# Serialise the selected pipeline.
model_path = Path("../models/best_model.joblib")
joblib.dump(best_model, model_path)

# Record which columns were treated as categorical / numeric and the target
# name next to the model file.
meta = {"cat_cols": cat_cols, "num_cols": num_cols, "target": target_col}
meta_path = Path("../models/metadata.json")
meta_path.write_text(json.dumps(meta))
