In [1]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.lightgbm
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from dotenv import load_dotenv
import os
import hopsworks

# Load .env (for Hopsworks & DagsHub creds)
load_dotenv()

# Hopsworks Login
project = hopsworks.login(api_key_value=os.getenv("HOPSWORKS_API_KEY"), project=os.getenv("HOPSWORKS_PROJECT"))
fs = project.get_feature_store()
fg = fs.get_feature_group(name="citi_bike_trips", version=1)  # ⬅️ change this
df = fg.read()

# Convert to hourly
df['start_hour'] = pd.to_datetime(df['started_at']).dt.floor('H')


2025-05-10 14:01:23,583 INFO: Initializing external client
2025-05-10 14:01:23,584 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-10 14:01:26,984 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1228957
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.70s) 


In [2]:
hourly_df = df.groupby('start_hour').size().reset_index(name='trip_count')
hourly_df['hour_of_day'] = hourly_df['start_hour'].dt.hour
hourly_df = hourly_df.sort_values('start_hour').reset_index(drop=True)


In [3]:
# Split
split_idx = int(len(hourly_df) * 0.8)
train = hourly_df.iloc[:split_idx]
test = hourly_df.iloc[split_idx:]

# Predict using hourly mean
mean_per_hour = train.groupby('hour_of_day')['trip_count'].mean()
test.loc[:, 'predicted'] = test['hour_of_day'].map(mean_per_hour)

# MAE
baseline_mae = mean_absolute_error(test['trip_count'], test['predicted'])
print(f"📉 Baseline MAE: {baseline_mae:.2f}")

# Log to DagsHub
username = os.getenv("DAGSHUB_USERNAME")
token = os.getenv("DAGSHUB_TOKEN")
repo = os.getenv("DAGSHUB_REPO_NAME")
mlflow.set_tracking_uri(f"https://{username}:{token}@dagshub.com/{username}/{repo}.mlflow")

with mlflow.start_run(run_name="Baseline_Mean_Per_Hour"):
    mlflow.log_param("model_type", "mean_per_hour")
    mlflow.log_param("features_used", "hour_of_day")
    mlflow.log_metric("MAE", baseline_mae)

    mean_per_hour.to_csv("mean_hour_lookup.csv")
    mlflow.log_artifact("mean_hour_lookup.csv")
    test[['start_hour', 'trip_count', 'predicted']].to_csv("baseline_preds.csv", index=False)
    mlflow.log_artifact("baseline_preds.csv")


📉 Baseline MAE: 18.08


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


🏃 View run Baseline_Mean_Per_Hour at: https://rockyglen:91640b3ddca4ff37282e7c57c52747b2e5511bda@dagshub.com/rockyglen/citi_bike_project.mlflow/#/experiments/0/runs/553ed4e598884ceb864b57031847bd15
🧪 View experiment at: https://rockyglen:91640b3ddca4ff37282e7c57c52747b2e5511bda@dagshub.com/rockyglen/citi_bike_project.mlflow/#/experiments/0


In [4]:
# Create lag features
for lag in range(1, 29):
    hourly_df[f'lag_{lag}'] = hourly_df['trip_count'].shift(lag)
hourly_df = hourly_df.dropna().reset_index(drop=True)

# Split
split_idx = int(len(hourly_df) * 0.8)
train = hourly_df.iloc[:split_idx]
test = hourly_df.iloc[split_idx:]

X_train = train[[f'lag_{i}' for i in range(1, 29)]]
y_train = train['trip_count']
X_test = test[[f'lag_{i}' for i in range(1, 29)]]
y_test = test['trip_count']

# Train
model = lgb.LGBMRegressor()
model.fit(X_train, y_train)
preds = model.predict(X_test)
lag_mae = mean_absolute_error(y_test, preds)
print(f"🚀 LightGBM (28 lags) MAE: {lag_mae:.2f}")

# Log
with mlflow.start_run(run_name="LightGBM_28_Lags"):
    mlflow.log_param("model_type", "LightGBM")
    mlflow.log_param("lags_used", 28)
    mlflow.log_metric("MAE", lag_mae)

    mlflow.lightgbm.log_model(model, artifact_path="model")
    test[['start_hour', 'trip_count']].assign(predicted=preds).to_csv("lgbm_preds.csv", index=False)
    mlflow.log_artifact("lgbm_preds.csv")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000635 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5270
[LightGBM] [Info] Number of data points in the train set: 6584, number of used features: 28
[LightGBM] [Info] Start training from score 45.706409
🚀 LightGBM (28 lags) MAE: 8.37




🏃 View run LightGBM_28_Lags at: https://rockyglen:91640b3ddca4ff37282e7c57c52747b2e5511bda@dagshub.com/rockyglen/citi_bike_project.mlflow/#/experiments/0/runs/f17b01c3b7aa4a0a8a183524c8d4b5cf
🧪 View experiment at: https://rockyglen:91640b3ddca4ff37282e7c57c52747b2e5511bda@dagshub.com/rockyglen/citi_bike_project.mlflow/#/experiments/0


In [None]:
import os
import pandas as pd
import numpy as np
import lightgbm as lgb
import mlflow
import mlflow.lightgbm
from sklearn.metrics import mean_absolute_error
from dotenv import load_dotenv
import hopsworks

# ---------------------
# Load environment variables
# ---------------------
load_dotenv()

# ---------------------
# Connect to Hopsworks
# ---------------------
project = hopsworks.login(
    api_key_value=os.getenv("HOPSWORKS_API_KEY"),
    project=os.getenv("HOPSWORKS_PROJECT")
)
fs = project.get_feature_store()
fg = fs.get_feature_group(name="citi_bike_trips", version=1)
df = fg.read()

# ---------------------
# Aggregate data by hour
# ---------------------
df['start_hour'] = pd.to_datetime(df['started_at']).dt.floor('H')
hourly_df = df.groupby('start_hour').size().reset_index(name='trip_count')
hourly_df['hour_of_day'] = hourly_df['start_hour'].dt.hour
hourly_df = hourly_df.sort_values('start_hour').reset_index(drop=True)

# ---------------------
# Baseline Model: Mean per hour-of-day
# ---------------------
split_idx = int(len(hourly_df) * 0.8)
train = hourly_df.iloc[:split_idx]
test = hourly_df.iloc[split_idx:]

mean_per_hour = train.groupby('hour_of_day')['trip_count'].mean()
test.loc[:, 'predicted'] = test['hour_of_day'].map(mean_per_hour)
baseline_mae = mean_absolute_error(test['trip_count'], test['predicted'])
print(f"📉 Baseline MAE: {baseline_mae:.2f}")

# ---------------------
# MLflow tracking setup (DagsHub)
# ---------------------
username = os.getenv("DAGSHUB_USERNAME")
token = os.getenv("DAGSHUB_TOKEN")
repo = os.getenv("DAGSHUB_REPO_NAME")
mlflow.set_tracking_uri(f"https://{username}:{token}@dagshub.com/{username}/{repo}.mlflow")

with mlflow.start_run(run_name="Baseline_Mean_Per_Hour"):
    mlflow.log_param("model_type", "mean_per_hour")
    mlflow.log_param("features_used", "hour_of_day")
    mlflow.log_metric("MAE", baseline_mae)
    mean_per_hour.to_csv("mean_hour_lookup.csv")
    mlflow.log_artifact("mean_hour_lookup.csv")
    test[['start_hour', 'trip_count', 'predicted']].to_csv("baseline_preds.csv", index=False)
    mlflow.log_artifact("baseline_preds.csv")

# ---------------------
# Full Lag Model (28 lag features)
# ---------------------
for lag in range(1, 29):
    hourly_df[f'lag_{lag}'] = hourly_df['trip_count'].shift(lag)
hourly_df = hourly_df.dropna().reset_index(drop=True)

split_idx = int(len(hourly_df) * 0.8)
train = hourly_df.iloc[:split_idx]
test = hourly_df.iloc[split_idx:]

X_train = train[[f'lag_{i}' for i in range(1, 29)]]
y_train = train['trip_count']
X_test = test[[f'lag_{i}' for i in range(1, 29)]]
y_test = test['trip_count']

model = lgb.LGBMRegressor()
model.fit(X_train, y_train)
preds = model.predict(X_test)
lag_mae = mean_absolute_error(y_test, preds)
print(f"🚀 LightGBM (28 lags) MAE: {lag_mae:.2f}")

with mlflow.start_run(run_name="LightGBM_28_Lags"):
    mlflow.log_param("model_type", "LightGBM")
    mlflow.log_param("lags_used", 28)
    mlflow.log_metric("MAE", lag_mae)
    mlflow.lightgbm.log_model(model, artifact_path="model")
    test[['start_hour', 'trip_count']].assign(predicted=preds).to_csv("lgbm_preds.csv", index=False)
    mlflow.log_artifact("lgbm_preds.csv")

# ---------------------
# Feature-Reduced Model (Top 10 lags)
# ---------------------
importances = model.feature_importances_
top10 = [f'lag_{i}' for i, _ in sorted(enumerate(importances, 1), key=lambda x: x[1], reverse=True)[:10]]

X_train_red = train[top10]
X_test_red = test[top10]

model_red = lgb.LGBMRegressor()
model_red.fit(X_train_red, y_train)
preds_red = model_red.predict(X_test_red)
reduced_mae = mean_absolute_error(y_test, preds_red)
print(f"⚡ Feature-Reduced MAE: {reduced_mae:.2f}")

with mlflow.start_run(run_name="LightGBM_Feature_Reduced"):
    mlflow.log_param("model_type", "LightGBM")
    mlflow.log_param("features_used", str(top10))
    mlflow.log_metric("MAE", reduced_mae)
    mlflow.lightgbm.log_model(model_red, artifact_path="model")
    test[['start_hour', 'trip_count']].assign(predicted=preds_red).to_csv("reduced_preds.csv", index=False)
    mlflow.log_artifact("reduced_preds.csv")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010164 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1885
[LightGBM] [Info] Number of data points in the train set: 6584, number of used features: 10
[LightGBM] [Info] Start training from score 45.706409
⚡ Feature-Reduced MAE: 8.32




🏃 View run LightGBM_Feature_Reduced at: https://rockyglen:91640b3ddca4ff37282e7c57c52747b2e5511bda@dagshub.com/rockyglen/citi_bike_project.mlflow/#/experiments/0/runs/5d6ca43fbb504432b45eba06cd53e75e
🧪 View experiment at: https://rockyglen:91640b3ddca4ff37282e7c57c52747b2e5511bda@dagshub.com/rockyglen/citi_bike_project.mlflow/#/experiments/0
