In [43]:
from sklearn.feature_selection import SelectKBest, f_regression

# Load the feature files for the top 3 locations
locations = ["HB102", "JC115", "HB105"]
dfs = []

for loc in locations:
    path = f"../data/features/{loc}.csv"  # <-- fixed path
    df = pd.read_csv(path)
    df["location_id"] = loc
    dfs.append(df)

# Combine all data
df_all = pd.concat(dfs).reset_index(drop=True)

# Drop non-numeric columns before modeling
X = df_all.drop(columns=["target", "pickup_hour", "location_id"])
y = df_all["target"]

# Select top 20 features
selector = SelectKBest(score_func=f_regression, k=20)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()].tolist()

print("✅ Top 20 Selected Features:\n", selected_features)


✅ Top 20 Selected Features:
 ['lag_1', 'lag_2', 'lag_3', 'lag_12', 'lag_13', 'lag_21', 'lag_22', 'lag_23', 'lag_24', 'lag_25', 'lag_26', 'lag_27', 'lag_35', 'lag_36', 'lag_37', 'lag_45', 'lag_46', 'lag_47', 'lag_48', 'hour']


In [44]:
# Use only top features for training
X_top = df_all[selected_features]
y = df_all["target"]

# Time-based train-test split (80% train, 20% test)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_top, y, test_size=0.2, shuffle=False)

print("Train shape:", X_train.shape, "\nTest shape:", X_test.shape)


Train shape: (20880, 20) 
Test shape: (5220, 20)


In [45]:
from lightgbm import LGBMRegressor

# Train LightGBM model
model = LGBMRegressor()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000651 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1297
[LightGBM] [Info] Number of data points in the train set: 20880, number of used features: 20
[LightGBM] [Info] Start training from score 5.167720


In [46]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("✅ LightGBM (Top 20 Features) MAE:", round(mae, 4))
print("✅ LightGBM (Top 20 Features) RMSE:", round(rmse, 4))


✅ LightGBM (Top 20 Features) MAE: 1.867
✅ LightGBM (Top 20 Features) RMSE: 2.7497


In [51]:
import mlflow
import os

# ✅ Set credentials (same as before)
os.environ["MLFLOW_TRACKING_USERNAME"] = "sai-snehitha"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "546ba070a0d146826c8d49111843d556196bcf9a"

# ✅ Set MLflow tracking
mlflow.set_tracking_uri("https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow")
mlflow.set_experiment("citi-bike-project")

# ✅ Log the MAE for Model 3 (replace with your actual result)
with mlflow.start_run(run_name="Model 3 - LightGBM (Feature Selected)"):
    mlflow.log_metric("mae", 1.867)  # 🔁 Replace this with your actual Model 3 MAE
    print("✅ LightGBM Model 3 MAE logged to DagsHub MLflow.")


✅ LightGBM Model 3 MAE logged to DagsHub MLflow.
🏃 View run Model 3 - LightGBM (Feature Selected) at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1/runs/c4ce4dd6d05a444682fe80d24887c06e
🧪 View experiment at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1


In [53]:
import mlflow
import os

# Set your DagsHub MLflow credentials
os.environ["MLFLOW_TRACKING_USERNAME"] = "sai-snehitha"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "546ba070a0d146826c8d49111843d556196bcf9a"  # Replace if needed

# Set MLflow tracking URI and experiment
mlflow.set_tracking_uri("https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow")
mlflow.set_experiment("citi-bike-project")

# Loop through the top 3 locations
for location in ["HB102", "HB105", "JC115"]:
    # Assuming you've already selected features, trained model, made predictions, and computed MAE
    # Example:
    # X_train, X_test, y_train, y_test = ...
    # model = ...
    # y_pred = model.predict(X_test)
    # mae = mean_absolute_error(y_test, y_pred)

    print(f"Reduced LGBM MAE for {location}: {mae:.4f}")

    with mlflow.start_run(run_name=f"Reduced_LGBM - {location}"):
        mlflow.set_tag("model_type", "Reduced_LGBM")
        mlflow.set_tag("location_id", location)
        mlflow.log_metric("mae", mae)


Reduced LGBM MAE for HB102: 1.8670
🏃 View run Reduced_LGBM - HB102 at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1/runs/676106a4ce1640acbfde10fa582afa3c
🧪 View experiment at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1
Reduced LGBM MAE for HB105: 1.8670
🏃 View run Reduced_LGBM - HB105 at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1/runs/8195120af318484a8ca4fec57fd3814d
🧪 View experiment at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1
Reduced LGBM MAE for JC115: 1.8670
🏃 View run Reduced_LGBM - JC115 at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1/runs/e754832bfec34256be30fdcc78cda91c
🧪 View experiment at: https://dagshub.com/sai-snehitha/citi-bike-prediction-system.mlflow/#/experiments/1
