In [1]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

In [2]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")

In [3]:
from datetime import datetime

from src.data_utils import split_time_series_data

X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2023, 8, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(47840, 674)
(47840,)
(39780, 674)
(39780,)


In [4]:
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    last_4_weeks_columns = [
            f"rides_t-{7*24}",  # 1 week ago
            f"rides_t-{14*24}", # 2 weeks ago
            f"rides_t-{21*24}", # 3 weeks ago
            f"rides_t-{28*24}"  # 4 weeks ago
        ]

        # Ensure the required columns exist in the test DataFrame
    for col in last_4_weeks_columns:
        if col not in X.columns:
            raise ValueError(f"Missing required column: {col}")

    # Calculate the average of the last 4 weeks
    X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)

    return X

from sklearn.preprocessing import FunctionTransformer

add_feature_average_rides_last_4_weeks = FunctionTransformer(
    average_rides_last_4_weeks, validate=False
)

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

class TemporalFeatureEngineer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_["hour"] = X_["pickup_hour"].dt.hour
        X_["day_of_week"] = X_["pickup_hour"].dt.dayofweek

        return X_.drop(columns=["pickup_hour", "pickup_location_id"])

add_temporal_features = TemporalFeatureEngineer()

In [6]:
import lightgbm as lgb

from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor()
)

In [7]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
import numpy as np

# Define the parameter grid
param_dist = {
    'lgbmregressor__num_leaves': [31, 50, 70, 100, 150],
    'lgbmregressor__learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'lgbmregressor__n_estimators': [100, 200, 500, 1000],
    'lgbmregressor__max_depth': [-1, 10, 20, 30, 50],
    'lgbmregressor__min_child_samples': [10, 20, 50, 100],
    'lgbmregressor__subsample': [0.6, 0.8, 0.9, 1.0],
    'lgbmregressor__colsample_bytree': [0.6, 0.8, 0.9, 1.0],
    'lgbmregressor__reg_alpha': [0, 0.1, 0.5, 1.0],
    'lgbmregressor__reg_lambda': [0, 0.1, 0.5, 1.0]
}

random_search = RandomizedSearchCV(
    estimator=pipeline, param_distributions=param_dist, n_iter=5, cv=3, scoring='neg_mean_absolute_error', random_state=42, verbose=2
)
random_search.fit(X_train, y_train)

# Get the best parameters and the best score
print("Best Parameters:", random_search.best_params_)
print("Best Score (Negative MAE):", random_search.best_score_)

# Evaluate the best model on the test set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("Test Set MAE:", mae)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[WinError 2] The system cannot find the file specified
  File "d:\Taxi_Project\nyc_taxi\taxi_venv\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "C:\Users\glenl\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 556, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\glenl\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 1038, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_sessi

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.278351 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 154557
[LightGBM] [Info] Number of data points in the train set: 31893, number of used features: 674
[LightGBM] [Info] Start training from score 13.775374
[CV] END lgbmregressor__colsample_bytree=0.6, lgbmregressor__learning_rate=0.3, lgbmregressor__max_depth=30, lgbmregressor__min_child_samples=100, lgbmregressor__n_estimators=200, lgbmregressor__num_leaves=31, lgbmregressor__reg_alpha=0.5, lgbmregressor__reg_lambda=0.1, lgbmregressor__subsample=0.9; total time=  18.1s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.220095 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 151804
[LightGBM] [Info] Number of data points in the train set: 31893, number of used features: 674
[LightGBM] [Info] Start training from scor

### Definition and Impact of Parameters in RandomizedSearchCV

In the `RandomizedSearchCV` cell, we define a parameter grid (`param_dist`) for hyperparameter tuning of the `LGBMRegressor`. Below are the definitions and impacts of each parameter:

1. **num_leaves**:
    - **Definition**: Maximum number of leaves in one tree.
    - **Impact**: Increasing `num_leaves` can improve the model's accuracy but may lead to overfitting. It also increases the model complexity and training time.

2. **learning_rate**:
    - **Definition**: Step size shrinkage used to prevent overfitting.
    - **Impact**: Lower values make the model more robust but require more trees. Higher values speed up training but may lead to overfitting.

3. **n_estimators**:
    - **Definition**: Number of boosting iterations (trees).
    - **Impact**: More trees can improve model performance but also increase training time and risk of overfitting.

4. **max_depth**:
    - **Definition**: Maximum depth of a tree.
    - **Impact**: Controls the complexity of the model. Deeper trees can capture more information but may overfit the data.

5. **min_child_samples**:
    - **Definition**: Minimum number of data points in a leaf node.
    - **Impact**: Helps control overfitting. Higher values prevent the model from learning overly specific patterns.

6. **subsample**:
    - **Definition**: Fraction of data to be used for fitting each tree.
    - **Impact**: Prevents overfitting by introducing randomness. Lower values can improve generalization but may reduce model accuracy.

7. **colsample_bytree**:
    - **Definition**: Fraction of features to be used for fitting each tree.
    - **Impact**: Reduces overfitting by introducing randomness. Lower values can improve generalization but may reduce model accuracy.

8. **reg_alpha**:
    - **Definition**: L1 regularization term on weights.
    - **Impact**: Adds a penalty for large coefficients, encouraging sparsity and reducing overfitting.

9. **reg_lambda**:
    - **Definition**: L2 regularization term on weights.
    - **Impact**: Adds a penalty for large coefficients, reducing overfitting but not encouraging sparsity.

By tuning these parameters, we aim to find the best combination that minimizes the mean absolute error (MAE) on the validation set, leading to a more accurate and generalizable model.

In [9]:
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
import os
load_dotenv() 

mlflow = set_mlflow_tracking()
log_model_to_mlflow(random_search, X_test, "LGBMRegressorWFE_Hyper", "mean_absolute_error", score=mae)

INFO:src.experiment_utils:MLflow tracking URI and credentials set.


INFO:src.experiment_utils:Experiment set to: LGBMRegressorWFE_Hyper
INFO:src.experiment_utils:Logged mean_absolute_error: 2.936101470652163
INFO:src.experiment_utils:Model signature inferred.
Successfully registered model 'RandomizedSearchCV'.
2025/03/04 02:34:56 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomizedSearchCV, version 1
Created version '1' of model 'RandomizedSearchCV'.
INFO:src.experiment_utils:Model logged with name: RandomizedSearchCV


🏃 View run tasteful-boar-583 at: https://dagshub.com/rockyglen/nyc_taxi.mlflow/#/experiments/6/runs/bf6bfdfb355b4dd682da4dc74141e72c
🧪 View experiment at: https://dagshub.com/rockyglen/nyc_taxi.mlflow/#/experiments/6


<mlflow.models.model.ModelInfo at 0x1b6c46217f0>