In [1]:
# Step 1: Import necessary libraries and login to Hopsworks
import hopsworks
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# Open a Hopsworks session and grab the project's feature store handle.
# `fs` is the object the next cell uses to look up the feature group.
# NOTE(review): how credentials are supplied is decided inside hopsworks.login()
# and is not visible in this notebook — confirm before running unattended.
project = hopsworks.login()
fs = project.get_feature_store()

# Plain status line so the cell output shows that setup completed.
print("✅ Logged into Hopsworks and imported libraries.")


  from .autonotebook import tqdm as notebook_tqdm


2025-05-03 20:34:31,863 INFO: Initializing external client
2025-05-03 20:34:31,863 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-03 20:34:33,110 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1215691
✅ Logged into Hopsworks and imported libraries.


In [2]:
# Step 2: Pull version 1 of the Citi Bike feature group into a DataFrame
feature_group = fs.get_feature_group(name="citi_bike_features_group", version=1)
df = feature_group.read()

# Preview the first rows; kept as the final expression so the notebook renders it.
df.head()


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.62s) 


Unnamed: 0,pickup_hour,target,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,...,lag_20,lag_21,lag_22,lag_23,lag_24,lag_25,lag_26,lag_27,lag_28,location_id
0,2024-04-16 18:00:00+00:00,29.0,32.0,14.0,10.0,10.0,6.0,3.0,3.0,2.0,...,4.0,11.0,18.0,32.0,24.0,13.0,5.0,4.0,6.0,JC115
1,2024-05-21 12:00:00+00:00,5.0,2.0,1.0,7.0,8.0,3.0,1.0,1.0,0.0,...,21.0,9.0,4.0,4.0,5.0,5.0,2.0,3.0,5.0,HB102
2,2024-04-10 01:00:00+00:00,0.0,0.0,1.0,2.0,6.0,8.0,7.0,29.0,28.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,4.0,6.0,JC115
3,2024-12-15 12:00:00+00:00,3.0,3.0,5.0,2.0,0.0,0.0,2.0,0.0,0.0,...,5.0,5.0,2.0,6.0,4.0,1.0,4.0,2.0,1.0,HB105
4,2024-04-08 08:00:00+00:00,3.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,HB105


In [3]:
# Step 3: Preprocess and create a time-based train/test split

# Length of the hold-out window, counted back from the latest pickup_hour.
TEST_WINDOW_DAYS = 30

# Parse pickup_hour as datetimes and order rows chronologically within each
# location. assign() returns a new frame instead of writing a column into the
# frame loaded in the previous cell.
df = (
    df
    .assign(pickup_hour=pd.to_datetime(df["pickup_hour"]))
    .sort_values(["location_id", "pickup_hour"])
)

# Rows at or before the cutoff train the model; rows strictly after it are held out.
cutoff = df["pickup_hour"].max() - pd.Timedelta(days=TEST_WINDOW_DAYS)
train_df = df[df["pickup_hour"] <= cutoff].copy()
test_df = df[df["pickup_hour"] > cutoff].copy()

# A missing timestamp compares False against the cutoff in both directions, so
# such a row would vanish from both splits without this check.
assert len(train_df) + len(test_df) == len(df), (
    "train/test split lost rows - check pickup_hour for missing timestamps"
)
# Catches an empty input frame or data spanning less than TEST_WINDOW_DAYS.
assert len(train_df) > 0 and len(test_df) > 0, (
    f"empty split: {len(train_df)} train rows, {len(test_df)} test rows"
)

print(f"✅ Training samples: {len(train_df)}")
print(f"✅ Testing samples: {len(test_df)}")


✅ Training samples: 24015
✅ Testing samples: 2145


In [4]:
import numpy as np
from sklearn.metrics import mean_absolute_error

# Naive baseline: predict each hour's target with the previous hour's value (lag_1).
y_true = test_df["target"].values
y_pred = test_df["lag_1"].values

# Mean absolute error over every test row.
mae = mean_absolute_error(y_true, y_pred)

# MAPE is undefined where the actual value is 0, so it is computed only over
# rows with a non-zero target; it stays NaN when no such row exists.
non_zero_indices = y_true != 0
mape = np.nan
if non_zero_indices.sum() > 0:
    actual = y_true[non_zero_indices]
    predicted = y_pred[non_zero_indices]
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100

print(f"✅ Baseline Model - MAE: {mae:.2f}")
print(f"✅ Baseline Model - MAPE: {mape:.2f}%")


✅ Baseline Model - MAE: 2.13
✅ Baseline Model - MAPE: 76.38%


In [5]:
import dagshub

# Connect this session to the DagsHub repository with MLflow tracking enabled.
dagshub.init(
    repo_owner='sai-snehitha',
    repo_name='citi-bike-prediction-system',
    mlflow=True,
)


2025-05-03 20:34:39,367 INFO: HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"


2025-05-03 20:34:39,378 INFO: Accessing as sai-snehitha
2025-05-03 20:34:39,671 INFO: HTTP Request: GET https://dagshub.com/api/v1/repos/sai-snehitha/citi-bike-prediction-system "HTTP/1.1 200 OK"
2025-05-03 20:34:39,913 INFO: HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"


2025-05-03 20:34:39,917 INFO: Initialized MLflow to track repo "sai-snehitha/citi-bike-prediction-system"


2025-05-03 20:34:39,919 INFO: Repository sai-snehitha/citi-bike-prediction-system initialized!


In [6]:
import mlflow

# Tracking smoke test: opens a run named "test-logging" and records one
# parameter and one metric so the connection to the tracking server can be checked.
# NOTE(review): the logged values are hard-coded literals rather than model
# results, and the recorded output URL places this run in experiment 0 — consider
# removing this cell once tracking is confirmed to work.
with mlflow.start_run(run_name="test-logging"):
    mlflow.log_param("test_param", 123)
    mlflow.log_metric("test_metric", 0.95)
    print("✅ Test run logged to DagsHub MLflow")


✅ Test run logged to DagsHub MLflow
🏃 View run test-logging at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/0/runs/d1e9a765077149bda48368e75f81a4b6
🧪 View experiment at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/0


In [7]:
import dagshub
# Initialise DagsHub tracking again so this cell can also be run on its own.
dagshub.init(repo_owner='sai-snehitha', repo_name='citi-bike-prediction-system', mlflow=True)

import mlflow
# Record the lag_1 baseline as its own MLflow run: descriptive parameters
# first, then the error metrics computed for the test split.
with mlflow.start_run(run_name="Baseline - Lag_1"):
    run_params = {
        "model_type": "baseline",
        "features_used": "lag_1",
        "data_split": "last_30_days",
    }
    for param_name, param_value in run_params.items():
        mlflow.log_param(param_name, param_value)

    run_metrics = {"MAE": mae, "MAPE": mape}
    for metric_name, metric_value in run_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    print("✅ Baseline model logged to DagsHub MLflow")


2025-05-03 20:34:41,870 INFO: HTTP Request: GET https://dagshub.com/api/v1/repos/sai-snehitha/citi-bike-prediction-system "HTTP/1.1 200 OK"


2025-05-03 20:34:41,879 INFO: Initialized MLflow to track repo "sai-snehitha/citi-bike-prediction-system"


2025-05-03 20:34:41,881 INFO: Repository sai-snehitha/citi-bike-prediction-system initialized!
✅ Baseline model logged to DagsHub MLflow
🏃 View run Baseline - Lag_1 at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/0/runs/dcb20d298c1049fab60aaa5c60c8cee9
🧪 View experiment at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/0
