In [1]:
import math

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterSampler
from xgboost import XGBRegressor


In [2]:
# Input files, relative to the notebook's working directory.
encoded_path = "../../datasets/processed/enriched_sample_encoded.csv"
raw_path = "../../datasets/raw/enriched_sample.csv"

encoded_df = pd.read_csv(encoded_path)
dates = pd.read_csv(raw_path, usecols=["Date"])

# The Date column is taken from the raw file and attached to the encoded frame
# row by row. pandas aligns the assignment on the index, so a row-count
# mismatch would not raise: it would silently leave NaT dates (or drop raw
# rows), and those rows would then fall out of every year-based split.
if len(dates) != len(encoded_df):
    raise ValueError(
        f"Row count mismatch: {raw_path} has {len(dates)} rows but "
        f"{encoded_path} has {len(encoded_df)}; dates cannot be attached by position."
    )

encoded_df["Date"] = pd.to_datetime(dates["Date"])
encoded_df.head()


Unnamed: 0,Traffic Volume,Average Speed,Travel Time Index,Congestion Level,Incident Reports,Public Transport Usage,Traffic Signal Compliance,Parking Usage,Pedestrian and Cyclist Count,Roadwork and Construction Activity,...,Road/Intersection Name_Jayanagar 4th Block,Road/Intersection Name_Marathahalli Bridge,Road/Intersection Name_Sarjapur Road,Road/Intersection Name_Silk Board Junction,Road/Intersection Name_Sony World Junction,Road/Intersection Name_South End Circle,Road/Intersection Name_Trinity Circle,Road/Intersection Name_Tumkur Road,Road/Intersection Name_Yeshwanthpur Circle,Date
0,50590,50.230299,1.5,100.0,0,70.63233,84.0446,85.403629,111,0,...,0,0,0,0,0,0,0,0,0,2022-01-01
1,30825,29.377125,1.5,100.0,1,41.924899,91.407038,59.983689,100,0,...,0,0,0,0,0,0,0,0,0,2022-01-01
2,7399,54.474398,1.039069,28.347994,0,44.662384,61.375541,95.46602,189,0,...,0,1,0,0,0,0,0,0,0,2022-01-01
3,60874,43.81761,1.5,100.0,1,32.773123,75.547092,63.567452,111,0,...,0,0,0,0,1,0,0,0,0,2022-01-01
4,57292,41.116763,1.5,100.0,3,35.092601,64.634762,93.155171,104,0,...,0,0,1,0,0,0,0,0,0,2022-01-01


In [3]:
# Number of rows per calendar year, listed in chronological order.
year_counts = (
    encoded_df["Date"]
    .dt.year
    .value_counts()
    .sort_index()
)
year_counts


Date
2022    3424
2023    3413
2024    2099
Name: count, dtype: int64

In [4]:
# Chronological hold-out by calendar year: 2022 -> train, 2023 -> validation,
# 2024 -> test.
train_mask = encoded_df["Date"].dt.year == 2022
val_mask = encoded_df["Date"].dt.year == 2023
test_mask = encoded_df["Date"].dt.year == 2024

# `features` still carries Date; it is dropped only from the model matrices.
target = encoded_df["Traffic Volume"]
features = encoded_df.drop(columns=["Traffic Volume"])

X_train, X_val, X_test = (
    features.loc[mask].drop(columns=["Date"])
    for mask in (train_mask, val_mask, test_mask)
)
y_train, y_val, y_test = (
    target.loc[mask] for mask in (train_mask, val_mask, test_mask)
)

X_train.shape, X_val.shape, X_test.shape


((3424, 54), (3413, 54), (2099, 54))

In [5]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Computed as the square root of scikit-learn's ``mean_squared_error`` for
    the given ``y_true`` and ``y_pred``; the result is a plain Python float.
    """
    return math.sqrt(mean_squared_error(y_true, y_pred))

# Template estimators holding the settings shared by every candidate. They are
# never fitted directly: the search and the final refit work on clones.
rf_base = RandomForestRegressor(n_jobs=-1, random_state=42)
xgb_base = XGBRegressor(
    objective="reg:squarederror",
    eval_metric="rmse",
    tree_method="hist",
    n_jobs=-1,
    random_state=42,
)

# Joint search space for both models. Keys prefixed "rf_" configure the random
# forest and keys prefixed "xgb_" configure XGBoost; the search loop maps each
# prefixed key onto the matching estimator keyword argument.
# NOTE(review): the search is seeded, so editing or reordering any list here
# presumably changes which candidates are drawn — confirm before tidying.
param_dist = {
    # Random forest
    "rf_n_estimators": [200, 300, 400, 500, 600],
    "rf_max_depth": [10, 12, 15, None],
    "rf_min_samples_split": [2, 5, 10],
    "rf_min_samples_leaf": [1, 2, 4],
    "rf_max_features": ["sqrt", "log2", 0.8],
    # XGBoost
    "xgb_n_estimators": [300, 400, 500, 600],
    "xgb_learning_rate": [0.05, 0.1, 0.2],
    "xgb_max_depth": [6, 8, 10],
    "xgb_subsample": [0.8, 0.9, 1.0],
    "xgb_colsample_bytree": [0.8, 0.9, 1.0],
    "xgb_reg_lambda": [0.8, 1.0, 1.5],
}

n_iter = 15
search_results = []

# Random search over paired configurations: every draw configures one random
# forest and one XGBoost model. Both are fitted on the training split and
# scored on train and validation, individually and as an equal-weight average.
for params in ParameterSampler(param_dist, n_iter=n_iter, random_state=42):
    sampled_depth = params["rf_max_depth"]
    rf_params = {
        "n_estimators": int(params["rf_n_estimators"]),
        # None is passed through untouched; int(None) would raise.
        "max_depth": None if sampled_depth is None else int(sampled_depth),
        "min_samples_split": int(params["rf_min_samples_split"]),
        "min_samples_leaf": int(params["rf_min_samples_leaf"]),
        "max_features": params["rf_max_features"],
    }
    xgb_params = {
        "n_estimators": int(params["xgb_n_estimators"]),
        "learning_rate": float(params["xgb_learning_rate"]),
        "max_depth": int(params["xgb_max_depth"]),
        "subsample": float(params["xgb_subsample"]),
        "colsample_bytree": float(params["xgb_colsample_bytree"]),
        "reg_lambda": float(params["xgb_reg_lambda"]),
    }

    # Random forest: configure a fresh clone, fit, predict on both splits.
    rf_model = clone(rf_base).set_params(**rf_params)
    rf_model.fit(X_train, y_train)
    train_rf_pred = rf_model.predict(X_train)
    val_rf_pred = rf_model.predict(X_val)

    # XGBoost: same procedure.
    xgb_model = clone(xgb_base).set_params(**xgb_params)
    xgb_model.fit(X_train, y_train)
    train_xgb_pred = xgb_model.predict(X_train)
    val_xgb_pred = xgb_model.predict(X_val)

    # Ensemble prediction is the plain mean of the two models.
    train_pred = (train_rf_pred + train_xgb_pred) / 2
    val_pred = (val_rf_pred + val_xgb_pred) / 2

    scores = {
        "train_rmse_rf": rmse(y_train, train_rf_pred),
        "train_rmse_xgb": rmse(y_train, train_xgb_pred),
        "train_rmse_ensemble": rmse(y_train, train_pred),
        "val_rmse_rf": rmse(y_val, val_rf_pred),
        "val_rmse_xgb": rmse(y_val, val_xgb_pred),
        "val_rmse_ensemble": rmse(y_val, val_pred),
    }

    # One flat record per candidate: the settings used, then the scores.
    search_results.append(
        {
            "rf_n_estimators": rf_params["n_estimators"],
            "rf_max_depth": rf_params["max_depth"],
            "rf_min_samples_split": rf_params["min_samples_split"],
            "rf_min_samples_leaf": rf_params["min_samples_leaf"],
            "rf_max_features": rf_params["max_features"],
            "xgb_n_estimators": xgb_params["n_estimators"],
            "xgb_learning_rate": xgb_params["learning_rate"],
            "xgb_max_depth": xgb_params["max_depth"],
            "xgb_subsample": xgb_params["subsample"],
            "xgb_colsample_bytree": xgb_params["colsample_bytree"],
            "xgb_reg_lambda": xgb_params["reg_lambda"],
            **scores,
        }
    )


# Rank candidates by validation RMSE of the ensemble, best (lowest) first.
search_df = (
    pd.DataFrame(search_results)
    .sort_values("val_rmse_ensemble", ascending=True)
    .reset_index(drop=True)
)
search_df


Unnamed: 0,rf_n_estimators,rf_max_depth,rf_min_samples_split,rf_min_samples_leaf,rf_max_features,xgb_n_estimators,xgb_learning_rate,xgb_max_depth,xgb_subsample,xgb_colsample_bytree,xgb_reg_lambda,train_rmse_rf,train_rmse_xgb,train_rmse_ensemble,val_rmse_rf,val_rmse_xgb,val_rmse_ensemble
0,200,10.0,2,1,0.8,500,0.05,6,0.8,0.8,0.8,3418.063614,1170.01688,2225.450705,5235.436008,5515.617826,5318.187859
1,300,15.0,2,2,sqrt,600,0.1,8,0.9,0.8,1.0,3461.321156,7.206911,1732.479105,5275.899065,5518.565756,5320.52055
2,200,12.0,2,1,sqrt,600,0.05,8,0.9,1.0,0.8,3898.368343,112.612951,1986.365611,5309.042773,5562.027508,5336.557556
3,200,10.0,5,4,0.8,500,0.1,6,1.0,0.9,1.5,3688.069846,589.917632,2068.311459,5268.845839,5606.944266,5362.814182
4,300,10.0,5,2,log2,500,0.05,6,1.0,1.0,0.8,4861.337776,1419.307049,3008.054816,5525.735593,5559.723374,5365.366997
5,300,15.0,2,2,0.8,300,0.05,8,0.8,0.9,1.5,2515.202074,724.089212,1565.860167,5303.915202,5514.927923,5366.708922
6,400,10.0,10,4,sqrt,400,0.2,8,1.0,0.8,1.0,4732.897326,3.749141,2367.229149,5328.767546,5684.759626,5369.918728
7,500,10.0,5,2,0.8,300,0.1,6,0.8,0.9,0.8,3499.982349,913.12903,2128.069906,5259.06923,5629.839252,5371.221746
8,300,12.0,5,1,sqrt,600,0.2,8,0.8,0.8,1.5,4020.40027,0.025448,2010.203788,5304.654512,5701.755344,5373.172439
9,300,12.0,10,4,0.8,400,0.2,10,1.0,0.9,1.0,3440.585737,0.010918,1720.29489,5273.283301,5648.679846,5375.063697


In [6]:
# Row 0 of the sorted search table is the candidate with the lowest validation
# RMSE for the ensemble.
best_row = search_df.iloc[0]

# Values read back from the table are cast to the types the estimators are
# given during the search. A missing rf_max_depth (stored as NaN when the
# sampled depth was None) is turned back into None.
best_depth = best_row["rf_max_depth"]
best_rf_params = {
    "n_estimators": int(best_row["rf_n_estimators"]),
    "max_depth": None if pd.isna(best_depth) else int(best_depth),
    "min_samples_split": int(best_row["rf_min_samples_split"]),
    "min_samples_leaf": int(best_row["rf_min_samples_leaf"]),
    "max_features": best_row["rf_max_features"],
}
best_xgb_params = {
    "n_estimators": int(best_row["xgb_n_estimators"]),
    "learning_rate": float(best_row["xgb_learning_rate"]),
    "max_depth": int(best_row["xgb_max_depth"]),
    "subsample": float(best_row["xgb_subsample"]),
    "colsample_bytree": float(best_row["xgb_colsample_bytree"]),
    "reg_lambda": float(best_row["xgb_reg_lambda"]),
}

# Refit the selected configuration on training + validation rows combined.
X_train_val = pd.concat([X_train, X_val], axis=0)
y_train_val = pd.concat([y_train, y_val], axis=0)

best_rf_model = clone(rf_base).set_params(**best_rf_params)
best_rf_model.fit(X_train_val, y_train_val)

best_xgb_model = clone(xgb_base).set_params(**best_xgb_params)
best_xgb_model.fit(X_train_val, y_train_val)

# Ensemble prediction is the plain mean of the two refitted models.
train_val_pred = (
    best_rf_model.predict(X_train_val) + best_xgb_model.predict(X_train_val)
) / 2
test_pred = (
    best_rf_model.predict(X_test) + best_xgb_model.predict(X_test)
) / 2

# Single-row report: the chosen settings for both models, followed by the
# ensemble RMSE on the refit data and on the test split.
summary_row = {
    "rf_n_estimators": best_rf_params["n_estimators"],
    "rf_max_depth": best_rf_params["max_depth"],
    "rf_min_samples_split": best_rf_params["min_samples_split"],
    "rf_min_samples_leaf": best_rf_params["min_samples_leaf"],
    "rf_max_features": best_rf_params["max_features"],
    "xgb_n_estimators": best_xgb_params["n_estimators"],
    "xgb_learning_rate": best_xgb_params["learning_rate"],
    "xgb_max_depth": best_xgb_params["max_depth"],
    "xgb_subsample": best_xgb_params["subsample"],
    "xgb_colsample_bytree": best_xgb_params["colsample_bytree"],
    "xgb_reg_lambda": best_xgb_params["reg_lambda"],
    "train_val_rmse_ensemble": rmse(y_train_val, train_val_pred),
    "test_rmse_ensemble": rmse(y_test, test_pred),
}
summary = pd.DataFrame([summary_row])
summary


Unnamed: 0,rf_n_estimators,rf_max_depth,rf_min_samples_split,rf_min_samples_leaf,rf_max_features,xgb_n_estimators,xgb_learning_rate,xgb_max_depth,xgb_subsample,xgb_colsample_bytree,xgb_reg_lambda,train_val_rmse_ensemble,test_rmse_ensemble
0,200,10,2,1,0.8,500,0.05,6,0.8,0.8,0.8,2892.280171,4998.279297
