In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor


In [6]:
df = pd.read_csv("../data/processed/ml_features_v1.csv")

In [7]:
df.shape

(3014, 12)

In [8]:
df.head()

Unnamed: 0,year,month,day_of_week,is_weekend,lag_1,lag_7,lag_14,lag_30,rolling_mean_7,rolling_mean_30,rolling_std_7,Boardings
0,2017,7,0,0,23297.0,7913.0,14107.0,13086.0,14201.142857,15168.266667,4731.559765,15374
1,2017,8,1,0,15374.0,11548.0,14843.0,21709.0,15267.0,15244.533333,3834.241342,15244
2,2017,8,2,0,15244.0,16024.0,14749.0,16934.0,15795.0,15029.033333,3474.346941,11989
3,2017,8,3,0,11989.0,13036.0,13819.0,13260.0,15218.571429,14864.2,3753.527801,14906
4,2017,8,4,0,14906.0,13771.0,15995.0,13222.0,15485.714286,14919.066667,3637.039809,15698


In [9]:
X = df.drop("Boardings", axis=1)
y = df["Boardings"]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Naive baseline: predict using previous hour's boardings
y_true = y_test
y_pred_naive = X_test["lag_1"]

mae_naive = mean_absolute_error(y_true, y_pred_naive)
print("Naive MAE:", mae_naive)

In [11]:
model = RandomForestRegressor(
    n_estimators=200,
    random_state=42
)

In [18]:
model.fit(X_train, y_train)

preds = model.predict(X_test)
mae_rf = mean_absolute_error(y_test, preds)

print("MAE:", mae_rf)

MAE: 3232.327686567164


In [13]:
importances = pd.Series(
    model.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

importances.head(10)


rolling_mean_7     0.520666
rolling_mean_30    0.095551
lag_1              0.079160
lag_14             0.062752
lag_7              0.055985
day_of_week        0.052647
is_weekend         0.035322
rolling_std_7      0.030081
lag_30             0.028321
month              0.026669
dtype: float64

In [None]:
# Gradient Boosting Regressor
gbr = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

# Train
gbr.fit(X_train, y_train)

# Predict
y_pred_gbr = gbr.predict(X_test)

# Evaluate
mae_gbr = mean_absolute_error(y_test, y_pred_gbr)
print(f"Gradient Boosting MAE: {mae_gbr}")


Gradient Boosting MAE: 3271.7814546528894


In [None]:
gbr_importance = pd.Series(
    gbr.feature_importances_,
    index=X_train.columns
).sort_values(ascending=False)

gbr_importance


rolling_mean_7     0.458266
lag_1              0.178064
rolling_mean_30    0.083121
lag_14             0.066817
lag_7              0.063478
day_of_week        0.054657
is_weekend         0.045785
month              0.021131
rolling_std_7      0.015012
lag_30             0.009659
year               0.004010
dtype: float64

In [19]:
results = pd.DataFrame({
    "Model": [
        "Naive (Lag-1)",
        "Random Forest",
        "Gradient Boosting"
    ],
    "MAE": [
        mae_naive,
        mae_rf,
        mae_gbr
    ]
})

results


Unnamed: 0,Model,MAE
0,Naive (Lag-1),4370.951907
1,Random Forest,3232.327687
2,Gradient Boosting,3271.781455
