In [2]:
%load_ext autoreload
%autoreload 2

import sys, os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

# Load Citi Bike tabular data
df = pd.read_parquet(TRANSFORMED_DATA_DIR / "citibike_tabular_data_28d.parquet")
df["pickup_hour"].info()


<class 'pandas.core.series.Series'>
RangeIndex: 1023 entries, 0 to 1022
Series name: pickup_hour
Non-Null Count  Dtype         
--------------  -----         
1023 non-null   datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 8.1 KB


In [4]:
from datetime import datetime
from src.data_utils import split_time_series_data

X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2023, 9, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)


(657, 674)
(657,)
(366, 674)
(366,)


In [6]:
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    last_4_weeks_columns = [f"rides_t-{7*24}", f"rides_t-{14*24}", f"rides_t-{21*24}", f"rides_t-{28*24}"]
    for col in last_4_weeks_columns:
        if col not in X.columns:
            raise ValueError(f"Missing required column: {col}")
    X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
    return X


In [8]:
from sklearn.preprocessing import FunctionTransformer
add_feature_average_rides_last_4_weeks = FunctionTransformer(average_rides_last_4_weeks, validate=False)

from sklearn.base import BaseEstimator, TransformerMixin
class TemporalFeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): return self
    def transform(self, X, y=None):
        X_ = X.copy()
        X_["hour"] = X_["pickup_hour"].dt.hour
        X_["day_of_week"] = X_["pickup_hour"].dt.dayofweek
        return X_.drop(columns=["pickup_hour", "pickup_location_id"])

add_temporal_features = TemporalFeatureEngineer()

import lightgbm as lgb
from sklearn.pipeline import make_pipeline
pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor()
)


In [10]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
import mlflow

param_distributions = {
    "lgbmregressor__num_leaves": [2, 50, 70, 256],
    "lgbmregressor__max_depth": [-1, 10, 20, 30],
    "lgbmregressor__learning_rate": [0.01, 0.05, 0.1, 0.2],
    "lgbmregressor__n_estimators": [100, 200, 500, 1000],
    "lgbmregressor__min_child_samples": [10, 20, 30, 50],
    "lgbmregressor__subsample": [0.6, 0.8, 1.0],
    "lgbmregressor__colsample_bytree": [0.6, 0.8, 1.0],
    "lgbmregressor__reg_alpha": [0, 0.1, 0.5, 1.0],
    "lgbmregressor__reg_lambda": [0, 0.1, 0.5, 1.0],
    "lgbmregressor__feature_fraction": [0.6, 0.7, 0.8, 0.9, 1.0], 
    "lgbmregressor__bagging_fraction": [0.6, 0.7, 0.8, 0.9, 1.0], 
    "lgbmregressor__bagging_freq": [1, 5, 10],
}

mlflow.start_run(run_name="LightGBM_LR_Tuning")

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=5,
    scoring="neg_mean_absolute_error",
    cv=3,
    verbose=2,
    random_state=42,
)
random_search.fit(X_train, y_train)


Fitting 3 folds for each of 5 candidates, totalling 15 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003437 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20117
[LightGBM] [Info] Number of data points in the train set: 438, number of used features: 674
[LightGBM] [Info] Start training from score 13.789954
[CV] END lgbmregressor__bagging_fraction=0.8, lgbmregressor__bagging_freq=10, lgbmregressor__colsample_bytree=1.0, lgbmregressor__feature_fraction=0.6, lgbmregressor__learning_rate=0.1, lgbmregressor__max_depth=20, lgbmregressor__min_child_samples=50, lgbmregressor__n_estimators=1000, lgbmregressor__num_leaves=2, lgbmregressor__reg_alpha=0.5, lgbmregressor__reg_lambda=0.1, lgbmregressor__subsample=0.8; total time=   0.2s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003920 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM]

In [12]:
best_lr = random_search.best_params_["lgbmregressor__learning_rate"]
y_pred_lr = random_search.best_estimator_.predict(X_test)
mae_lr = mean_absolute_error(y_test, y_pred_lr)

mlflow.log_param("best_learning_rate", best_lr)
mlflow.log_metric("test_mae_lr", mae_lr)
print("Learning Rate Tuning - Best LR:", best_lr)
print("Learning Rate Tuning - Test MAE:", mae_lr)

mlflow.end_run()


Learning Rate Tuning - Best LR: 0.01
Learning Rate Tuning - Test MAE: 6.04841953152625
🏃 View run LightGBM_LR_Tuning at: https://dagshub.com/sahilsubhasbhaivachhani/final_project.mlflow/#/experiments/0/runs/3828dd943cbe469dadf6f10cad979007
🧪 View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/final_project.mlflow/#/experiments/0


In [14]:
print("Best Parameters:", random_search.best_params_)
print("Best Score (Negative MAE):", random_search.best_score_)

best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("Test Set MAE:", mae)


Best Parameters: {'lgbmregressor__subsample': 0.8, 'lgbmregressor__reg_lambda': 0.5, 'lgbmregressor__reg_alpha': 0, 'lgbmregressor__num_leaves': 2, 'lgbmregressor__n_estimators': 1000, 'lgbmregressor__min_child_samples': 30, 'lgbmregressor__max_depth': 10, 'lgbmregressor__learning_rate': 0.01, 'lgbmregressor__feature_fraction': 0.6, 'lgbmregressor__colsample_bytree': 1.0, 'lgbmregressor__bagging_freq': 5, 'lgbmregressor__bagging_fraction': 1.0}
Best Score (Negative MAE): -5.03732760220604
Test Set MAE: 6.04841953152625


In [16]:
param_distributions_2 = {
    "lgbmregressor__learning_rate": [best_lr],
    "lgbmregressor__num_leaves": [31, 50, 70, 100],
    "lgbmregressor__max_depth": [-1, 10, 20, 30],
    "lgbmregressor__n_estimators": [100, 200, 500],
}

mlflow.start_run(run_name="LightGBM_2nd_Tuning")

random_search_2 = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions_2,
    n_iter=5,
    scoring="neg_mean_absolute_error",
    cv=3,
    verbose=2,
    random_state=42,
)
random_search_2.fit(X_train, y_train)

best_params = random_search_2.best_params_
y_pred_final = random_search_2.best_estimator_.predict(X_test)
mae_final = mean_absolute_error(y_test, y_pred_final)

mlflow.log_params(best_params)
mlflow.log_metric("test_mae_final", mae_final)

print("Other Parameters Tuning - Best Params:", best_params)
print("Other Parameters Tuning - Test MAE:", mae_final)

mlflow.end_run()


Fitting 3 folds for each of 5 candidates, totalling 15 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005685 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20117
[LightGBM] [Info] Number of data points in the train set: 438, number of used features: 674
[LightGBM] [Info] Start training from score 13.789954
[CV] END lgbmregressor__learning_rate=0.01, lgbmregressor__max_depth=20, lgbmregressor__n_estimators=100, lgbmregressor__num_leaves=100; total time=   0.1s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002415 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19650
[LightGBM] [Info] Number of data points in the train set: 438, number of used features: 674
[LightGBM] [Info] Start training from score 9.541096
[CV] END lgbmregressor__learning_rate=0.01, lgbmregressor__max_depth=20, lgbmregressor__n_estimator