In [2]:
%load_ext autoreload
%autoreload 2


In [4]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))


In [6]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

# Load 28-day window Citi Bike features
df = pd.read_parquet(TRANSFORMED_DATA_DIR / "citibike_tabular_data_28d.parquet")


In [8]:
from datetime import datetime
from src.data_utils import split_time_series_data

X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2023, 9, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)


(657, 674)
(657,)
(366, 674)
(366,)


In [10]:
past_ride_columns = [c for c in X_train.columns if c.startswith("rides_")]
X_train_only_numeric = X_train[past_ride_columns]
X_test_only_numeric = X_test[past_ride_columns]


In [12]:
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    last_4_weeks_columns = [
        f"rides_t-{7*24}",  # 1 week ago
        f"rides_t-{14*24}", # 2 weeks ago
        f"rides_t-{21*24}", # 3 weeks ago
        f"rides_t-{28*24}"  # 4 weeks ago
    ]

    for col in last_4_weeks_columns:
        if col not in X.columns:
            raise ValueError(f"Missing required column: {col}")

    X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
    return X


In [14]:
from sklearn.preprocessing import FunctionTransformer

add_feature_average_rides_last_4_weeks = FunctionTransformer(
    average_rides_last_4_weeks, validate=False
)
add_feature_average_rides_last_4_weeks.fit_transform(X_train)


Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id,average_rides_last_4_weeks
0,1,0,0,0,0,0,0,0,0,0,...,0,0,5,12,15,12,4,2023-01-25 11:00:00,5329.03,4.00
1,0,0,0,0,0,0,0,0,0,0,...,0,1,5,12,21,6,8,2023-01-26 11:00:00,5329.03,3.00
2,0,0,0,0,0,0,0,0,0,0,...,0,1,1,10,12,6,8,2023-01-27 11:00:00,5329.03,7.00
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,6,5,10,2023-01-28 11:00:00,5329.03,11.50
4,15,12,10,24,23,14,12,6,5,5,...,0,0,0,1,2,9,6,2023-01-29 11:00:00,5329.03,11.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
896,23,22,20,50,46,28,38,37,30,17,...,0,2,2,0,2,14,8,2023-08-27 11:00:00,6948.10,17.50
897,17,20,19,12,29,27,43,48,27,13,...,1,2,3,5,15,11,9,2023-08-28 11:00:00,6948.10,9.75
898,14,12,9,11,12,12,26,23,26,12,...,2,2,3,4,23,15,11,2023-08-29 11:00:00,6948.10,12.25
899,16,13,13,9,20,17,31,20,15,6,...,2,2,9,3,15,0,7,2023-08-30 11:00:00,6948.10,11.00


In [16]:
from sklearn.base import BaseEstimator, TransformerMixin

class TemporalFeatureEngineer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_["hour"] = X_["pickup_hour"].dt.hour
        X_["day_of_week"] = X_["pickup_hour"].dt.dayofweek
        return X_.drop(columns=["pickup_hour", "pickup_location_id"])

add_temporal_features = TemporalFeatureEngineer()
add_temporal_features.fit_transform(X_train)


Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,average_rides_last_4_weeks,hour,day_of_week
0,1,0,0,0,0,0,0,0,0,0,...,0,0,5,12,15,12,4,4.00,11,2
1,0,0,0,0,0,0,0,0,0,0,...,0,1,5,12,21,6,8,3.00,11,3
2,0,0,0,0,0,0,0,0,0,0,...,0,1,1,10,12,6,8,7.00,11,4
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,6,5,10,11.50,11,5
4,15,12,10,24,23,14,12,6,5,5,...,0,0,0,1,2,9,6,11.00,11,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
896,23,22,20,50,46,28,38,37,30,17,...,0,2,2,0,2,14,8,17.50,11,6
897,17,20,19,12,29,27,43,48,27,13,...,1,2,3,5,15,11,9,9.75,11,0
898,14,12,9,11,12,12,26,23,26,12,...,2,2,3,4,23,15,11,12.25,11,1
899,16,13,13,9,20,17,31,20,15,6,...,2,2,9,3,15,0,7,11.00,11,2


In [18]:
import lightgbm as lgb
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor()
)

pipeline.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003491 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23004
[LightGBM] [Info] Number of data points in the train set: 657, number of used features: 674
[LightGBM] [Info] Start training from score 13.375951


In [20]:
add_temporal_features = TemporalFeatureEngineer()
add_temporal_features.fit_transform(X_train)

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,average_rides_last_4_weeks,hour,day_of_week
0,1,0,0,0,0,0,0,0,0,0,...,0,0,5,12,15,12,4,4.00,11,2
1,0,0,0,0,0,0,0,0,0,0,...,0,1,5,12,21,6,8,3.00,11,3
2,0,0,0,0,0,0,0,0,0,0,...,0,1,1,10,12,6,8,7.00,11,4
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,6,5,10,11.50,11,5
4,15,12,10,24,23,14,12,6,5,5,...,0,0,0,1,2,9,6,11.00,11,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
896,23,22,20,50,46,28,38,37,30,17,...,0,2,2,0,2,14,8,17.50,11,6
897,17,20,19,12,29,27,43,48,27,13,...,1,2,3,5,15,11,9,9.75,11,0
898,14,12,9,11,12,12,26,23,26,12,...,2,2,3,4,23,15,11,12.25,11,1
899,16,13,13,9,20,17,31,20,15,6,...,2,2,9,3,15,0,7,11.00,11,2


In [22]:
from sklearn.metrics import mean_absolute_error

predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f"{test_mae:.4f}")


5.6361


In [24]:
X_train

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id,average_rides_last_4_weeks
0,1,0,0,0,0,0,0,0,0,0,...,0,0,5,12,15,12,4,2023-01-25 11:00:00,5329.03,4.00
1,0,0,0,0,0,0,0,0,0,0,...,0,1,5,12,21,6,8,2023-01-26 11:00:00,5329.03,3.00
2,0,0,0,0,0,0,0,0,0,0,...,0,1,1,10,12,6,8,2023-01-27 11:00:00,5329.03,7.00
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,6,5,10,2023-01-28 11:00:00,5329.03,11.50
4,15,12,10,24,23,14,12,6,5,5,...,0,0,0,1,2,9,6,2023-01-29 11:00:00,5329.03,11.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
896,23,22,20,50,46,28,38,37,30,17,...,0,2,2,0,2,14,8,2023-08-27 11:00:00,6948.10,17.50
897,17,20,19,12,29,27,43,48,27,13,...,1,2,3,5,15,11,9,2023-08-28 11:00:00,6948.10,9.75
898,14,12,9,11,12,12,26,23,26,12,...,2,2,3,4,23,15,11,2023-08-29 11:00:00,6948.10,12.25
899,16,13,13,9,20,17,31,20,15,6,...,2,2,9,3,15,0,7,2023-08-30 11:00:00,6948.10,11.00


In [26]:
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv

load_dotenv()
mlflow = set_mlflow_tracking()

log_model_to_mlflow(pipeline, X_test, "LGBMRegressorWFE-Citibike", "mean_absolute_error", score=test_mae)


INFO:src.experiment_utils:MLflow tracking URI and credentials set.
2025/05/09 21:29:48 INFO mlflow.tracking.fluent: Experiment with name 'LGBMRegressorWFE-Citibike' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: LGBMRegressorWFE-Citibike
INFO:src.experiment_utils:Logged mean_absolute_error: 5.6360656031957
INFO:src.experiment_utils:Model signature inferred.
Successfully registered model 'Pipeline'.
2025/05/09 21:29:58 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Pipeline, version 1
Created version '1' of model 'Pipeline'.
INFO:src.experiment_utils:Model logged with name: Pipeline


🏃 View run loud-frog-612 at: https://dagshub.com/sahilsubhasbhaivachhani/final_project.mlflow/#/experiments/5/runs/ba10b31d948049d78fd344e3c8a117a3
🧪 View experiment at: https://dagshub.com/sahilsubhasbhaivachhani/final_project.mlflow/#/experiments/5


<mlflow.models.model.ModelInfo at 0x335da2990>