In [10]:
from pathlib import Path
import math

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterSampler
from sklearn.tree import DecisionTreeRegressor

In [11]:
# Load the pre-encoded feature table and re-attach the calendar date from the
# raw export so the data can be split chronologically further down.
encoded_path = "../../datasets/processed/enriched_sample_encoded.csv"
raw_path = "../../datasets/raw/enriched_sample.csv"

encoded_df = pd.read_csv(encoded_path)
dates = pd.read_csv(raw_path, usecols=["Date"])

# The Date column is joined purely by row position (both frames get a default
# RangeIndex from read_csv). If the two files ever differ in row count, pandas
# would silently align on the index and produce NaT / dropped dates, and those
# rows would then vanish from every year-based split. Fail loudly instead.
if len(dates) != len(encoded_df):
    raise ValueError(
        "Row count mismatch between encoded and raw datasets: "
        f"{len(encoded_df)} encoded rows vs {len(dates)} raw rows; "
        "cannot attach 'Date' by position."
    )

encoded_df["Date"] = pd.to_datetime(dates["Date"])
encoded_df.head()

Unnamed: 0,Traffic Volume,Average Speed,Travel Time Index,Congestion Level,Incident Reports,Public Transport Usage,Traffic Signal Compliance,Parking Usage,Pedestrian and Cyclist Count,Roadwork and Construction Activity,...,Road/Intersection Name_Jayanagar 4th Block,Road/Intersection Name_Marathahalli Bridge,Road/Intersection Name_Sarjapur Road,Road/Intersection Name_Silk Board Junction,Road/Intersection Name_Sony World Junction,Road/Intersection Name_South End Circle,Road/Intersection Name_Trinity Circle,Road/Intersection Name_Tumkur Road,Road/Intersection Name_Yeshwanthpur Circle,Date
0,50590,50.230299,1.5,100.0,0,70.63233,84.0446,85.403629,111,0,...,0,0,0,0,0,0,0,0,0,2022-01-01
1,30825,29.377125,1.5,100.0,1,41.924899,91.407038,59.983689,100,0,...,0,0,0,0,0,0,0,0,0,2022-01-01
2,7399,54.474398,1.039069,28.347994,0,44.662384,61.375541,95.46602,189,0,...,0,1,0,0,0,0,0,0,0,2022-01-01
3,60874,43.81761,1.5,100.0,1,32.773123,75.547092,63.567452,111,0,...,0,0,0,0,1,0,0,0,0,2022-01-01
4,57292,41.116763,1.5,100.0,3,35.092601,64.634762,93.155171,104,0,...,0,0,1,0,0,0,0,0,0,2022-01-01


In [12]:
# Number of rows per calendar year, ordered chronologically.
year_counts = (
    encoded_df["Date"]
    .dt.year
    .value_counts()
    .sort_index()
)
year_counts

Date
2022    3424
2023    3413
2024    2099
Name: count, dtype: int64

In [13]:
# Separate the target from the predictors; Date stays in `features` only so
# it can drive the chronological split and is removed from each model matrix.
date_series = encoded_df["Date"]
target = encoded_df["Traffic Volume"]
features = encoded_df.drop(columns=["Traffic Volume"])

# Chronological split: 2022 -> train, 2023 -> validation, 2024 -> test.
train_mask = date_series.dt.year.eq(2022)
val_mask = date_series.dt.year.eq(2023)
test_mask = date_series.dt.year.eq(2024)

X_train, X_val, X_test = (
    features.loc[mask].drop(columns=["Date"])
    for mask in (train_mask, val_mask, test_mask)
)
y_train, y_val, y_test = (
    target.loc[mask]
    for mask in (train_mask, val_mask, test_mask)
)

X_train.shape, X_val.shape, X_test.shape

((3424, 54), (3413, 54), (2099, 54))

In [14]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Computed as the square root of scikit-learn's ``mean_squared_error``.
    """
    return math.sqrt(mean_squared_error(y_true, y_pred))

# Baseline: a shallow, regularised decision tree fitted on the training year only.
baseline_model = DecisionTreeRegressor(
    max_depth=5,
    min_samples_split=20,
    min_samples_leaf=8,
    max_features=0.8,
    random_state=42,
)
baseline_model.fit(X_train, y_train)

# RMSE of the baseline on each split, in train / val / test order.
baseline_metrics = pd.Series(
    {
        label: rmse(y_part, baseline_model.predict(X_part))
        for label, X_part, y_part in (
            ("train_rmse", X_train, y_train),
            ("val_rmse", X_val, y_val),
            ("test_rmse", X_test, y_test),
        )
    }
)
baseline_metrics


train_rmse    5190.424464
val_rmse      5381.242816
test_rmse     5122.662840
dtype: float64

In [15]:
# Random search over decision-tree hyperparameters: each sampled configuration
# is fitted on the training split and scored on both train and validation.
param_dist = {
    "max_depth": [5, 10, 15, 20, None],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 8],
    "max_features": ["sqrt", "log2", None, 0.8],
}
n_iter = 15
search_results = []

for params in ParameterSampler(param_dist, n_iter=n_iter, random_state=42):
    # fit() returns the estimator itself, so construction and fitting chain.
    model = DecisionTreeRegressor(random_state=42, **params).fit(X_train, y_train)
    train_rmse = rmse(y_train, model.predict(X_train))
    val_rmse = rmse(y_val, model.predict(X_val))
    search_results.append(
        {
            # Follow param_dist's key order so the result columns keep the
            # max_depth / min_samples_split / min_samples_leaf / max_features layout.
            **{name: params[name] for name in param_dist},
            "train_rmse": train_rmse,
            "val_rmse": val_rmse,
        }
    )

# Best (lowest validation RMSE) configuration first.
search_df = (
    pd.DataFrame(search_results)
    .sort_values(by="val_rmse")
    .reset_index(drop=True)
)
search_df

Unnamed: 0,max_depth,min_samples_split,min_samples_leaf,max_features,train_rmse,val_rmse
0,5,20,8,0.8,5190.424464,5381.242816
1,10,10,8,,4380.472312,5824.261103
2,10,2,8,sqrt,5351.464983,5955.377142
3,15,20,8,sqrt,4928.155156,6125.928496
4,15,2,8,,3853.561014,6262.476115
5,15,20,2,,3689.979678,6271.50761
6,15,10,4,0.8,3293.085334,6488.711005
7,20,10,2,,2705.185736,6703.767482
8,5,5,4,sqrt,7134.302598,7109.177435
9,20,2,1,,884.128585,7195.752845


In [16]:
# Select the configuration with the lowest validation RMSE.
#
# The winner is taken from the raw `search_results` records rather than from
# `search_df.iloc[0]`: a DataFrame round-trip is lossy for hyperparameters.
# As soon as one sampled `max_depth` is None, pandas stores it as NaN and
# upcasts the whole column to float, so the best row would yield
# max_depth=5.0 (or NaN) -- values that scikit-learn's parameter validation
# rejects. The original dicts keep native int / None / str / float values.
param_names = ["max_depth", "min_samples_split", "min_samples_leaf", "max_features"]
best_trial = min(search_results, key=lambda trial: trial["val_rmse"])
best_params = {name: best_trial[name] for name in param_names}

# Refit the chosen configuration on train + validation before the final test
# evaluation.
best_model = DecisionTreeRegressor(random_state=42, **best_params)
X_train_val = pd.concat([X_train, X_val], axis=0)
y_train_val = pd.concat([y_train, y_val], axis=0)
best_model.fit(X_train_val, y_train_val)

final_metrics = pd.Series(
    {
        "train_val_rmse": rmse(y_train_val, best_model.predict(X_train_val)),
        "test_rmse": rmse(y_test, best_model.predict(X_test)),
    }
)

# One-row summary: chosen hyperparameters alongside the final scores.
summary = pd.DataFrame([best_params])
summary["train_val_rmse"] = final_metrics["train_val_rmse"]
summary["test_rmse"] = final_metrics["test_rmse"]
summary

Unnamed: 0,max_depth,min_samples_split,min_samples_leaf,max_features,train_val_rmse,test_rmse
0,5,20,8,0.8,5213.134039,5120.33677
