In [1]:
import math

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterSampler

In [2]:
# Root of the dataset tree; change this single line when running on another machine.
DATA_DIR = "/home/pranav/projects/fods_assignment/datasets"
encoded_path = f"{DATA_DIR}/processed/traffic_features_encoded_filtered.csv"
raw_path = f"{DATA_DIR}/raw/enriched_sample.csv"

encoded_df = pd.read_csv(encoded_path)
# Only the Date column is read from the raw file; it is attached to the encoded rows below.
dates = pd.read_csv(raw_path, usecols=["Date"])

# Date is attached row-by-row via index alignment, so both files must have the same
# number of rows. Without this check a mismatch would go unnoticed: alignment fills
# missing positions with NaT and ignores extra raw rows.
if len(dates) != len(encoded_df):
    raise ValueError(
        f"Row count mismatch: {len(encoded_df)} encoded rows vs {len(dates)} raw rows; "
        "cannot attach Date by position."
    )
encoded_df["Date"] = pd.to_datetime(dates["Date"])
encoded_df.head()

Unnamed: 0,Traffic Volume,Average Speed,Travel Time Index,Congestion Level,Incident Reports,Public Transport Usage,Traffic Signal Compliance,Parking Usage,Pedestrian and Cyclist Count,Roadwork and Construction Activity,...,Road/Intersection Name_Jayanagar 4th Block,Road/Intersection Name_Marathahalli Bridge,Road/Intersection Name_Sarjapur Road,Road/Intersection Name_Silk Board Junction,Road/Intersection Name_Sony World Junction,Road/Intersection Name_South End Circle,Road/Intersection Name_Trinity Circle,Road/Intersection Name_Tumkur Road,Road/Intersection Name_Yeshwanthpur Circle,Date
0,50590,50.230299,1.5,100.0,0,70.63233,84.0446,85.403629,111,0,...,0,0,0,0,0,0,0,0,0,2022-01-01
1,30825,29.377125,1.5,100.0,1,41.924899,91.407038,59.983689,100,0,...,0,0,0,0,0,0,0,0,0,2022-01-01
2,7399,54.474398,1.039069,28.347994,0,44.662384,61.375541,95.46602,189,0,...,0,1,0,0,0,0,0,0,0,2022-01-01
3,60874,43.81761,1.5,100.0,1,32.773123,75.547092,63.567452,111,0,...,0,0,0,0,1,0,0,0,0,2022-01-01
4,57292,41.116763,1.5,100.0,3,35.092601,64.634762,93.155171,104,0,...,0,0,1,0,0,0,0,0,0,2022-01-01


In [3]:
# Number of rows per calendar year, sorted by year.
row_years = encoded_df["Date"].dt.year
year_counts = row_years.value_counts().sort_index()
year_counts

Date
2022    3424
2023    3413
2024    2099
Name: count, dtype: int64

In [4]:
# Separate the regression target from the remaining columns.
target = encoded_df["Traffic Volume"]
features = encoded_df.drop(columns=["Traffic Volume"])

# Split by calendar year: 2022 -> train, 2023 -> validation, 2024 -> test.
row_year = encoded_df["Date"].dt.year
train_mask = row_year == 2022
val_mask = row_year == 2023
test_mask = row_year == 2024
split_masks = (train_mask, val_mask, test_mask)

# Date is only needed to build the masks, so it is removed from the model inputs.
model_inputs = features.drop(columns=["Date"])
X_train, X_val, X_test = (model_inputs.loc[mask] for mask in split_masks)
y_train, y_val, y_test = (target.loc[mask] for mask in split_masks)

X_train.shape, X_val.shape, X_test.shape

((3424, 43), (3413, 43), (2099, 43))

In [8]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Computed as the square root of scikit-learn's ``mean_squared_error``.
    """
    return math.sqrt(mean_squared_error(y_true, y_pred))

# Candidate values for each random-forest hyperparameter explored by the random search.
param_dist = dict(
    n_estimators=[200, 300, 400, 500, 600],
    max_depth=[None, 10, 15, 20, 30],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 4, 8],
    max_features=["sqrt", "log2", 0.8],
)
n_iter = 20
search_results = []

# Fit one forest per sampled configuration on the training split and score it on
# both the training and validation splits.
sampled_configs = ParameterSampler(param_dist, n_iter=n_iter, random_state=42)
for params in sampled_configs:
    model = RandomForestRegressor(random_state=42, n_jobs=-1, **params)
    model.fit(X_train, y_train)
    train_rmse = rmse(y_train, model.predict(X_train))
    val_rmse = rmse(y_val, model.predict(X_val))

    # One record per configuration: the hyperparameters (in param_dist order), then the scores.
    record = {name: params[name] for name in param_dist}
    record["train_rmse"] = train_rmse
    record["val_rmse"] = val_rmse
    search_results.append(record)

# Rank configurations by validation RMSE, best (lowest) first.
ranked_results = pd.DataFrame(search_results).sort_values("val_rmse", ascending=True)
search_df = ranked_results.reset_index(drop=True)
search_df

Unnamed: 0,n_estimators,max_depth,min_samples_split,min_samples_leaf,max_features,train_rmse,val_rmse
0,600,30.0,10,1,sqrt,3587.349629,5253.771995
1,500,15.0,2,4,sqrt,4269.171986,5263.245746
2,600,,5,4,sqrt,4066.211103,5268.788617
3,600,,2,4,sqrt,4066.211103,5268.788617
4,500,,20,4,sqrt,4495.349036,5269.453876
5,600,10.0,20,2,0.8,4104.811461,5273.708016
6,500,10.0,2,2,0.8,3514.411827,5283.354329
7,200,20.0,5,4,sqrt,4104.051232,5288.104588
8,500,30.0,20,4,0.8,3921.950569,5292.289779
9,300,10.0,10,1,0.8,3760.595104,5295.148576


In [13]:
# Hyperparameters of the top row of search_df (the search results table).
param_columns = ["n_estimators", "max_depth", "min_samples_split", "min_samples_leaf", "max_features"]
best_params = search_df.iloc[0][param_columns].to_dict()
int_keys = ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf']

# The round trip through the DataFrame turns integer settings into floats (e.g. 30.0),
# and a sampled max_depth=None comes back as NaN. Restore ints, and map a missing
# max_depth back to None instead of letting int(nan) raise ValueError.
for key in int_keys:
    value = best_params[key]
    if key == "max_depth" and pd.isna(value):
        best_params[key] = None
    else:
        best_params[key] = int(value)
best_model = RandomForestRegressor(random_state=42, n_jobs=-1, **best_params)

# Refit the chosen configuration on the training and validation splits combined.
X_train_val = pd.concat([X_train, X_val], axis=0)
y_train_val = pd.concat([y_train, y_val], axis=0)
best_model.fit(X_train_val, y_train_val)

# RMSE on the data the final model was fit on, and on the held-out test split.
final_metrics = pd.Series(
    {
        "train_val_rmse": rmse(y_train_val, best_model.predict(X_train_val)),
        "test_rmse": rmse(y_test, best_model.predict(X_test)),
    }
)

# One-row summary: chosen hyperparameters followed by the two scores.
summary = pd.DataFrame([best_params])
summary["train_val_rmse"] = final_metrics["train_val_rmse"]
summary["test_rmse"] = final_metrics["test_rmse"]
summary

Unnamed: 0,n_estimators,max_depth,min_samples_split,min_samples_leaf,max_features,train_val_rmse,test_rmse
0,600,30,10,1,sqrt,3506.686014,4995.242102
