In [1]:
import os
import time

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:

# Data root: use /data when that directory exists, otherwise the relative
# ../data folder.
DATA_BASE = "/data" if os.path.isdir("/data") else "../data"

# Processed inputs live under the data root; model and report artifacts are
# written to fixed relative folders.
PROCESSED_DIR = os.path.join(DATA_BASE, "processed")
MODELS_DIR = "../models"
OUTPUT_DIR = "../output"

# Create both artifact folders up front; exist_ok makes re-running the cell safe.
for artifact_dir in (MODELS_DIR, OUTPUT_DIR):
    os.makedirs(artifact_dir, exist_ok=True)

def _read_processed(filename):
    """Load one parquet file from PROCESSED_DIR and return it as a DataFrame."""
    return pd.read_parquet(os.path.join(PROCESSED_DIR, filename))


# Feature matrices are used as loaded; the targets are reduced to the
# "fare_amount" column so they become Series.
X_train = _read_processed("X_train.parquet")
X_test = _read_processed("X_test.parquet")
y_train = _read_processed("y_train.parquet")["fare_amount"]
y_test = _read_processed("y_test.parquet")["fare_amount"]

# -------- model ----------
# Histogram-based gradient boosting regressor with a fixed seed so repeated
# runs of the notebook produce the same fit.
model = HistGradientBoostingRegressor(
    max_depth=10,            # cap on the depth of each tree
    learning_rate=0.05,      # shrinkage applied to each boosting step
    max_iter=200,            # maximum number of boosting iterations
    min_samples_leaf=20,     # minimum number of samples per leaf
    l2_regularization=0.0,   # no L2 penalty
    random_state=42
)

# Time the fit with perf_counter: it is monotonic and high-resolution, so the
# measured interval cannot be distorted by wall-clock adjustments the way
# time.time() can.
start = time.perf_counter()
model.fit(X_train, y_train)
train_time = time.perf_counter() - start
print(f"Training time: {train_time:.2f} s")

# -------- predictions ----------
# Predict on both splits so train and test metrics can be compared.
y_pred_train = model.predict(X_train)
y_pred_test  = model.predict(X_test)

def mape(y_true, y_pred):
    """Mean absolute percentage error, in percent, over non-zero targets.

    Parameters
    ----------
    y_true : array-like
        Observed values. Entries equal to 0 are excluded, because the
        percentage error is undefined for them.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    numpy.float64
        ``100 * mean(|y_true - y_pred| / |y_true|)`` over the rows where
        ``y_true != 0``, or NaN when no such row exists (all-zero or empty
        input).
    """
    # Coerce to float arrays so lists, Series and integer inputs all take the
    # same numeric path.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = y_true != 0
    # Without this guard np.mean would run on an empty slice, which emits a
    # RuntimeWarning and yields NaN only as a side effect.
    if not mask.any():
        return np.float64(np.nan)
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

def rmse(y_true, y_pred):
    """Root mean squared error: the square root of sklearn's mean squared error."""
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

def _regression_metrics(y_true, y_pred):
    """Return the RMSE, MAE, MAPE and R2 scores for one split as a dict."""
    return {
        "RMSE": rmse(y_true, y_pred),
        "MAE": mean_absolute_error(y_true, y_pred),
        "MAPE": mape(y_true, y_pred),
        "R2": r2_score(y_true, y_pred),
    }


# Same four metrics for both splits so the two printed rows are comparable.
metrics_train = _regression_metrics(y_train, y_pred_train)
metrics_test = _regression_metrics(y_test, y_pred_test)
print("Train:", metrics_train)
print("Test :", metrics_test)


# -------- feature importance ----------
# HistGradientBoostingRegressor does not define `feature_importances_` (as of
# current scikit-learn releases), so the attribute check alone is False for this
# model and no CSV would be written. Keep the attribute path for estimators that
# have it and otherwise fall back to permutation importance.
if hasattr(model, "feature_importances_"):
    importances = model.feature_importances_
else:
    # Sample rows by position so features and targets stay paired row-for-row
    # without relying on the two objects sharing index labels. The sample size
    # is capped to keep the repeated predictions cheap; the seed is fixed so
    # the report is reproducible.
    perm_rng = np.random.default_rng(42)
    perm_rows = perm_rng.choice(
        len(X_test), size=min(len(X_test), 20_000), replace=False
    )
    # Default scoring is the estimator's own score (R2 for regressors): each
    # importance is the mean drop in that score when one column is shuffled.
    perm_result = permutation_importance(
        model,
        X_test.iloc[perm_rows],
        y_test.iloc[perm_rows],
        n_repeats=5,
        random_state=42,
    )
    importances = perm_result.importances_mean

# One row per feature, most important first.
feat_df = pd.DataFrame({
    "feature": X_train.columns,
    "importance": importances
}).sort_values("importance", ascending=False)
feat_df.to_csv(os.path.join(OUTPUT_DIR, "hgb_feature_importance.csv"), index=False)

# Persist the fitted estimator for later use.
joblib.dump(model, os.path.join(MODELS_DIR, "hist_gbm_model.pkl"))
print("Saved HistGradientBoosting model.")


Training time: 16.12 s
Train: {'RMSE': np.float64(2.2300878844476024), 'MAE': 0.6236305825268245, 'MAPE': np.float64(16.260962466213662), 'R2': 0.984128802872261}
Test : {'RMSE': np.float64(2.3180818674214603), 'MAE': 0.6363799012975981, 'MAPE': np.float64(12.734552732981609), 'R2': 0.9789624684290328}
Saved HistGradientBoosting model.
