In [11]:
# Imports
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Load the feature files for the top 3 locations
locations = ["HB102", "JC115", "HB105"]
dfs = []

for loc in locations:
    path = f"../data/features/{loc}.csv"
    df = pd.read_csv(path, index_col="pickup_hour")  # read with pickup_hour as index
    df.index = pd.to_datetime(df.index)
    df = df.reset_index()  # move pickup_hour into column
    df["location_id"] = loc
    dfs.append(df)

# Combine into a single DataFrame
df_all = pd.concat(dfs).reset_index(drop=True)
df_all.head()


Unnamed: 0,pickup_hour,target,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,...,lag_43,lag_44,lag_45,lag_46,lag_47,lag_48,hour,dayofweek,is_weekend,location_id
0,2024-01-03 00:00:00,0,1.0,0.0,1.0,1.0,10.0,18.0,20.0,12.0,...,0.0,2.0,2.0,20.0,5.0,5.0,0,2,0,HB102
1,2024-01-03 01:00:00,0,0.0,1.0,0.0,1.0,1.0,10.0,18.0,20.0,...,0.0,0.0,2.0,2.0,20.0,5.0,1,2,0,HB102
2,2024-01-03 02:00:00,0,0.0,0.0,1.0,0.0,1.0,1.0,10.0,18.0,...,0.0,0.0,0.0,2.0,2.0,20.0,2,2,0,HB102
3,2024-01-03 03:00:00,0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,10.0,...,0.0,0.0,0.0,0.0,2.0,2.0,3,2,0,HB102
4,2024-01-03 04:00:00,0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,2.0,4,2,0,HB102


In [14]:
# Drop non-numeric columns
X = df_all.drop(columns=["target", "pickup_hour", "location_id"])  # drop object column
y = df_all["target"]

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Train LightGBM model
model = LGBMRegressor()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000799 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3249
[LightGBM] [Info] Number of data points in the train set: 20880, number of used features: 51
[LightGBM] [Info] Start training from score 5.167720


In [15]:
# Train the model on all features
model = LGBMRegressor()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000623 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3249
[LightGBM] [Info] Number of data points in the train set: 20880, number of used features: 51
[LightGBM] [Info] Start training from score 5.167720


In [16]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Display results
print("✅ LightGBM (All Features) MAE:", round(mae, 4))
print("✅ LightGBM (All Features) RMSE:", round(rmse, 4))


✅ LightGBM (All Features) MAE: 1.8343
✅ LightGBM (All Features) RMSE: 2.718


In [17]:
import mlflow
import os

# ✅ Set credentials (same as before)
os.environ["MLFLOW_TRACKING_USERNAME"] = "sai-snehitha"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "546ba070a0d146826c8d49111843d556196bcf9a"

# ✅ Set MLflow tracking
mlflow.set_tracking_uri("https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow")
mlflow.set_experiment("citi-bike-project")

# ✅ Log the MAE for Model 2 (replace with actual value if different)
with mlflow.start_run(run_name="Model 2 - LightGBM (Full Lag Features)"):
    mlflow.log_metric("mae", 1.8343)  
    print("✅ LightGBM Model 2 MAE logged to DagsHub MLflow.")


✅ LightGBM Model 2 MAE logged to DagsHub MLflow.
🏃 View run Model 2 - LightGBM (Full Lag Features) at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1/runs/279136f183bb41e3b0135ed1cf6a38b9
🧪 View experiment at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1


In [19]:
import mlflow
import lightgbm as lgb  # ✅ Add this

with mlflow.start_run():
    model = lgb.LGBMRegressor()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)

    mlflow.log_metric("mae", mae)

    # ✅ Register this as the best model
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        registered_model_name="citi_bike_best_model"
    )

print("✅ Registered Model 2 as citi_bike_best_model")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000751 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3249
[LightGBM] [Info] Number of data points in the train set: 20880, number of used features: 51
[LightGBM] [Info] Start training from score 5.167720


Registered model 'citi_bike_best_model' already exists. Creating a new version of this model...
2025/05/08 20:19:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: citi_bike_best_model, version 2
Created version '2' of model 'citi_bike_best_model'.


🏃 View run hilarious-midge-354 at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1/runs/b8ab4ce4452344ec8abde135b1b984c2
🧪 View experiment at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1
✅ Registered Model 2 as citi_bike_best_model


In [20]:
import mlflow
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

mlflow.set_tracking_uri("https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow")
mlflow.set_registry_uri("https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow")

with mlflow.start_run() as run:
    model = lgb.LGBMRegressor()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    
    mlflow.log_metric("mae", mae)

    # ✅ Register this as "best model"
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        registered_model_name="citi_bike_best_model"
    )

    print("✅ Registered Model 2 as citi_bike_best_model")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000659 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3249
[LightGBM] [Info] Number of data points in the train set: 20880, number of used features: 51
[LightGBM] [Info] Start training from score 5.167720


Registered model 'citi_bike_best_model' already exists. Creating a new version of this model...
2025/05/08 20:32:24 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: citi_bike_best_model, version 3
Created version '3' of model 'citi_bike_best_model'.


✅ Registered Model 2 as citi_bike_best_model
🏃 View run sincere-ray-250 at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1/runs/b8e72f9a9ae64e8d9d28ca009dfdb9ba
🧪 View experiment at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1


In [21]:
import mlflow
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_absolute_error

with mlflow.start_run():
    model = lgb.LGBMRegressor()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)

    mlflow.log_metric("mae", mae)

    # ✅ Prepare input_example and signature
    input_example = X_test.iloc[:2]
    from mlflow.models.signature import infer_signature
    signature = infer_signature(X_test, y_pred)

    # ✅ Re-register model with full metadata
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        registered_model_name="citi_bike_best_model",
        input_example=input_example,
        signature=signature
    )

print("✅ Re-registered correct model with signature.")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000789 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3249
[LightGBM] [Info] Number of data points in the train set: 20880, number of used features: 51
[LightGBM] [Info] Start training from score 5.167720


Registered model 'citi_bike_best_model' already exists. Creating a new version of this model...
2025/05/08 20:35:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: citi_bike_best_model, version 4
Created version '4' of model 'citi_bike_best_model'.


🏃 View run spiffy-zebra-197 at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1/runs/ba6c0904173d4ff3b3ffa2ab08cbae5c
🧪 View experiment at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1
✅ Re-registered correct model with signature.


In [22]:
import mlflow
import os

# Set MLflow DagsHub credentials if not already set
os.environ["MLFLOW_TRACKING_USERNAME"] = "sai-snehitha"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "546ba070a0d146826c8d49111843d556196bcf9a"  # replace if expired

# Set MLflow tracking URI and experiment
mlflow.set_tracking_uri("https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow")
mlflow.set_experiment("citi-bike-project")

# Loop through locations (if not already inside one)
for location in ["HB102", "HB105", "JC115"]:
    # (Assuming you’ve already trained model, made predictions, computed mae here)
    
    # Example if re-computing:
    # X = ...
    # y = ...
    # y_pred = model.predict(X_test)
    # mae = mean_absolute_error(y_test, y_pred)

    print(f"📊 LightGBM Full MAE for {location}: {mae:.4f}")

    with mlflow.start_run(run_name=f"Full_LGBM - {location}"):
        mlflow.set_tag("model_type", "Full_LGBM")
        mlflow.set_tag("location_id", location)
        mlflow.log_metric("mae", mae)


📊 LightGBM Full MAE for HB102: 1.8343
🏃 View run Full_LGBM - HB102 at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1/runs/21f973c601064bec8a32e10a8fbf62c9
🧪 View experiment at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1
📊 LightGBM Full MAE for HB105: 1.8343
🏃 View run Full_LGBM - HB105 at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1/runs/193b3adead224af3a28e76d96b8bdbbe
🧪 View experiment at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1
📊 LightGBM Full MAE for JC115: 1.8343
🏃 View run Full_LGBM - JC115 at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1/runs/ab41052747114dfe9a122163ea56878e
🧪 View experiment at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1
