In [4]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

In [5]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))


In [6]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")
df

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id,target
0,5,0,0,0,0,0,7,0,0,0,...,0,18,0,0,0,0,0,2023-01-29 00:00:00,HB101,15
1,24,0,0,0,0,0,7,0,0,0,...,0,3,0,0,0,0,0,2023-01-29 12:00:00,HB101,27
2,2,0,0,0,0,0,13,0,0,0,...,0,13,0,0,0,0,0,2023-01-30 00:00:00,HB101,1
3,34,0,0,0,0,0,18,0,0,0,...,0,17,0,0,0,0,0,2023-01-30 12:00:00,HB101,17
4,1,0,0,0,0,0,11,0,0,0,...,0,24,0,0,0,0,0,2023-01-31 00:00:00,HB101,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57959,17,0,0,0,0,0,5,0,0,0,...,0,3,0,0,0,0,0,2023-12-29 12:00:00,JC116,17
57960,4,0,0,0,0,0,6,0,0,0,...,0,9,0,0,0,0,0,2023-12-30 00:00:00,JC116,0
57961,27,0,0,0,0,0,14,0,0,0,...,0,7,0,0,0,0,0,2023-12-30 12:00:00,JC116,13
57962,0,0,0,0,0,0,1,0,0,0,...,0,7,0,0,0,0,0,2023-12-31 00:00:00,JC116,4


In [9]:
from datetime import datetime

from src.data_utils import split_time_series_data

X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2023, 9, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(36980, 674)
(36980,)
(20984, 674)
(20984,)


In [14]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA

class CitiBikePredictor(BaseEstimator, RegressorMixin):
    def __init__(self, use_pca=False, n_components=10, random_state=42):
        """
        Args:
            use_pca: whether to apply PCA or just use raw features
            n_components: number of PCA components (only used if use_pca=True)
        """
        self.use_pca = use_pca
        self.n_components = n_components
        self.random_state = random_state
        
        # Internal objects
        self.scaler = None
        self.pca = None
        self.model = None
        self.feature_cols = None

    def fit(self, X, y):
        # Select only rides_t-* columns
        self.feature_cols = [col for col in X.columns if col.startswith("rides_t-")]
        X_filtered = X[self.feature_cols]
        
        # Scale
        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X_filtered)

        # Optional PCA
        if self.use_pca:
            self.pca = PCA(n_components=self.n_components, random_state=self.random_state)
            X_transformed = self.pca.fit_transform(X_scaled)
        else:
            X_transformed = X_scaled

        # Train model
        self.model = RandomForestRegressor(random_state=self.random_state)
        self.model.fit(X_transformed, y)
        return self

    def predict(self, X):
        # Expect full DataFrame
        X_filtered = X[self.feature_cols]
        X_scaled = self.scaler.transform(X_filtered)

        if self.use_pca:
            X_transformed = self.pca.transform(X_scaled)
        else:
            X_transformed = X_scaled

        return self.model.predict(X_transformed)


In [15]:
# Train model
model = CitiBikePredictor(use_pca=False)  # or use_pca=True for PCA version
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
from sklearn.metrics import mean_absolute_error
test_mae = mean_absolute_error(y_test, y_pred)
print(f"Test MAE: {test_mae:.2f}")


Test MAE: 2.78


In [None]:
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
sys.path
load_dotenv() 
log_model_to_mlflow(model, X_test, experiment_name="RandomRegressor_feature_reduced_without_pca", metric_name="mean_absolute_error", score=test_mae)

['/opt/anaconda3/envs/citiride_pred_env/lib/python311.zip',
 '/opt/anaconda3/envs/citiride_pred_env/lib/python3.11',
 '/opt/anaconda3/envs/citiride_pred_env/lib/python3.11/lib-dynload',
 '',
 '/opt/anaconda3/envs/citiride_pred_env/lib/python3.11/site-packages',
 '/Users/vamsisaigarapati/Documents/github/citibike_ride_prediction']

True

INFO:src.experiment_utils:MLflow tracking URI and credentials set.
INFO:src.experiment_utils:Experiment set to: RandomRegressor_feature_reduced_without_pca
INFO:src.experiment_utils:Logged mean_absolute_error: 2.780011415092333
INFO:src.experiment_utils:Model signature inferred.
Successfully registered model 'CitiBikePredictor'.
2025/05/09 20:34:23 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: CitiBikePredictor, version 1
Created version '1' of model 'CitiBikePredictor'.
INFO:src.experiment_utils:Model logged with name: CitiBikePredictor


🏃 View run delicate-whale-318 at: https://dagshub.com/vamsisaigarapati/citibike_ride_prediction.mlflow/#/experiments/4/runs/0c67afff2ab641ee97a4615b2353e17c
🧪 View experiment at: https://dagshub.com/vamsisaigarapati/citibike_ride_prediction.mlflow/#/experiments/4


<mlflow.models.model.ModelInfo at 0x16aa6b410>

In [18]:
# Train model
model = CitiBikePredictor(use_pca=True)  # or use_pca=True for PCA version
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
from sklearn.metrics import mean_absolute_error
test_mae = mean_absolute_error(y_test, y_pred)
print(f"Test MAE: {test_mae:.2f}")


Test MAE: 3.16


In [19]:
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
sys.path
load_dotenv() 
log_model_to_mlflow(model, X_test, experiment_name="RandomRegressor_feature_reduced_with_pca", metric_name="mean_absolute_error", score=test_mae)

['/opt/anaconda3/envs/citiride_pred_env/lib/python311.zip',
 '/opt/anaconda3/envs/citiride_pred_env/lib/python3.11',
 '/opt/anaconda3/envs/citiride_pred_env/lib/python3.11/lib-dynload',
 '',
 '/opt/anaconda3/envs/citiride_pred_env/lib/python3.11/site-packages',
 '/Users/vamsisaigarapati/Documents/github/citibike_ride_prediction']

True

INFO:src.experiment_utils:MLflow tracking URI and credentials set.
2025/05/09 20:37:24 INFO mlflow.tracking.fluent: Experiment with name 'RandomRegressor_feature_reduced_with_pca' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: RandomRegressor_feature_reduced_with_pca
INFO:src.experiment_utils:Logged mean_absolute_error: 3.155717991443833
INFO:src.experiment_utils:Model signature inferred.
Registered model 'CitiBikePredictor' already exists. Creating a new version of this model...
2025/05/09 20:37:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: CitiBikePredictor, version 2
Created version '2' of model 'CitiBikePredictor'.
INFO:src.experiment_utils:Model logged with name: CitiBikePredictor


🏃 View run adaptable-smelt-596 at: https://dagshub.com/vamsisaigarapati/citibike_ride_prediction.mlflow/#/experiments/5/runs/bfc25dd6840f4f358a95ebfb762f92f9
🧪 View experiment at: https://dagshub.com/vamsisaigarapati/citibike_ride_prediction.mlflow/#/experiments/5


<mlflow.models.model.ModelInfo at 0x16a7fc390>