In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

DATA_CSV = Path("data/raw/velib_timeseries_5min.csv")
FREQ_MIN = 5
HORIZONS = [15, 30, 60]

# --- Mode DEV : échantillonner des stations "actives" pour itérer vite
DEV_MODE = False        # passe à True pour tester rapidement
DEV_N_STATIONS = 200    # nb de stations à garder en DEV

usecols = ["ts","station_id","bikes_available","capacity"]  # suffisant pour ce notebook
dtype = {
    "station_id": "category",
    "bikes_available": "float32",   # int16 possible, mais on reste large
    "capacity": "float32",
}
df = pd.read_csv(DATA_CSV, usecols=usecols, dtype=dtype, parse_dates=["ts"])
df = df.sort_values(["station_id","ts"]).reset_index(drop=True)

if DEV_MODE:
    var_by_sta = df.groupby("station_id")["bikes_available"].var().sort_values(ascending=False)
    keep = var_by_sta.head(DEV_N_STATIONS).index
    df = df[df["station_id"].isin(keep)].copy()

df.info(memory_usage="deep")
df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3305428 entries, 0 to 3305427
Data columns (total 4 columns):
 #   Column           Dtype              
---  ------           -----              
 0   ts               datetime64[ns, UTC]
 1   station_id       category           
 2   bikes_available  float32            
 3   capacity         float32            
dtypes: category(1), datetime64[ns, UTC](1), float32(2)
memory usage: 56.9 MB


Unnamed: 0,ts,station_id,bikes_available,capacity
0,2025-08-21 17:45:00+00:00,1002059045,23.0,27.0
1,2025-08-21 17:50:00+00:00,1002059045,25.0,27.0
2,2025-08-21 17:55:00+00:00,1002059045,23.0,27.0


In [11]:
from sklearn.metrics import mean_absolute_error

tmp = df.copy()
tmp["hour"] = tmp["ts"].dt.hour
tmp["dow"]  = tmp["ts"].dt.dayofweek

# cibles multi-horizon
for h in HORIZONS:
    tmp[f"y_{h}"] = tmp.groupby("station_id")["bikes_available"].shift(-(h//FREQ_MIN))

# split temporel 70/30 (respecte la chronologie)
cutoff = tmp["ts"].quantile(0.7)
train_df = tmp[tmp["ts"] <= cutoff]
test_df  = tmp[tmp["ts"] >  cutoff]

# médiane historique par (station, jour, heure) calculée sur train
med = (train_df
       .groupby(["station_id","dow","hour"])["bikes_available"]
       .median()
       .rename("pred_seasonal")
       .reset_index())

test_seasonal = test_df.merge(med, on=["station_id","dow","hour"], how="left")

seasonal_mae = {}
for h in HORIZONS:
    y = test_seasonal[f"y_{h}"]
    p = test_seasonal["pred_seasonal"]
    ok = y.notna() & p.notna()
    seasonal_mae[h] = float(mean_absolute_error(y[ok], p[ok]))
seasonal_mae

{15: 5.903214931488037, 30: 5.915978908538818, 60: 5.96649694442749}

In [12]:
def make_features_pandas(data, horizons=HORIZONS, freq_min=FREQ_MIN):
    out = data.copy()
    out["hour"] = out["ts"].dt.hour.astype("uint8")
    out["dow"]  = out["ts"].dt.dayofweek.astype("uint8")
    # encodage cyclique
    out["hour_sin"] = np.sin(2*np.pi*out["hour"]/24).astype("float32")
    out["hour_cos"] = np.cos(2*np.pi*out["hour"]/24).astype("float32")

    # lags : 5, 10, 15, 30, 60 min
    for k in [1,2,3,6,12]:
        out[f"lag_{k*freq_min}"] = out.groupby("station_id")["bikes_available"].shift(k).astype("float32")

    # rolling 1h sur t-1 (évite fuite)
    out["roll_mean_60"] = (out.groupby("station_id")["bikes_available"]
                             .shift(1).rolling(12).mean()
                             .reset_index(level=0, drop=True).astype("float32"))

    # delta court terme
    out["delta_5"] = (out["bikes_available"] - out.groupby("station_id")["bikes_available"].shift(1)).astype("float32")

    # targets
    for h in horizons:
        out[f"y_{h}"] = out.groupby("station_id")["bikes_available"].shift(-(h//freq_min)).astype("float32")

    out = out.dropna().reset_index(drop=True)
    return out

df_feat = make_features_pandas(df)
df_feat.head(3)

Unnamed: 0,ts,station_id,bikes_available,capacity,hour,dow,hour_sin,hour_cos,lag_5,lag_10,lag_15,lag_30,lag_60,roll_mean_60,delta_5,y_15,y_30,y_60
0,2025-08-21 18:45:00+00:00,1002059045,19.0,27.0,18,3,-1.0,-1.83697e-16,17.0,19.0,23.0,22.0,23.0,20.833334,2.0,19.0,18.0,21.0
1,2025-08-21 18:50:00+00:00,1002059045,19.0,27.0,18,3,-1.0,-1.83697e-16,19.0,17.0,19.0,21.0,25.0,20.5,0.0,18.0,16.0,20.0
2,2025-08-21 18:55:00+00:00,1002059045,19.0,27.0,18,3,-1.0,-1.83697e-16,19.0,19.0,17.0,21.0,23.0,20.0,0.0,18.0,18.0,21.0


In [13]:
X_cols = [
    "hour_sin","hour_cos","dow","capacity","delta_5","roll_mean_60",
    "lag_5","lag_10","lag_15","lag_30","lag_60",
]
y_cols = [f"y_{h}" for h in HORIZONS]

cut = df_feat["ts"].quantile(0.7)
train = df_feat[df_feat["ts"] <= cut]
test  = df_feat[df_feat["ts"] >  cut]

X_train, y_train = train[X_cols].astype("float32"), train[y_cols].astype("float32")
X_test,  y_test  = test[X_cols].astype("float32"),  test[y_cols].astype("float32")

X_train.shape, X_test.shape

((2290105, 11), (980091, 11))

In [14]:
from sklearn.linear_model import Ridge
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error

models = {
    "Ridge": MultiOutputRegressor(Ridge(alpha=1.0, random_state=42)),
    "HGBR": MultiOutputRegressor(HistGradientBoostingRegressor(
        max_depth=6, learning_rate=0.08, max_iter=400, random_state=42))
}

scores = {}
for name, m in models.items():
    m.fit(X_train, y_train)
    pred = m.predict(X_test)
    scores[name] = {h: float(mean_absolute_error(y_test.iloc[:, i], pred[:, i]))
                    for i, h in enumerate(HORIZONS)}
scores

{'Ridge': {15: 0.8099028468132019,
  30: 1.2351136207580566,
  60: 1.8224033117294312},
 'HGBR': {15: 0.8053530677578495,
  30: 1.212714058145366,
  60: 1.7675516458203147}}

In [15]:
# Naïve : prédire "valeur courante" pour toutes les cibles
# On récupère la valeur à t dans le même jeu 'test'
naive_mae = {}
# on a encore 'bikes_available' dans df_feat ; réaligne sur le même 'test'
test_view = test[["bikes_available"] + y_cols].copy()
for i, h in enumerate(HORIZONS):
    y = test_view[y_cols[i]].astype("float32")
    p = test_view["bikes_available"].astype("float32")
    ok = y.notna() & p.notna()
    naive_mae[h] = float(mean_absolute_error(y[ok], p[ok]))

# Tableau comparatif
comp = pd.DataFrame({
    "Horizon_min": HORIZONS,
    "MAE_Naive":   [naive_mae[h] for h in HORIZONS],
    "MAE_Seasonal":[seasonal_mae[h] for h in HORIZONS],
    "MAE_Ridge":   [scores["Ridge"][h] for h in HORIZONS],
    "MAE_HGBR":    [scores["HGBR"][h] for h in HORIZONS],
})
comp

Unnamed: 0,Horizon_min,MAE_Naive,MAE_Seasonal,MAE_Ridge,MAE_HGBR
0,15,0.754617,5.903215,0.809903,0.805353
1,30,1.160963,5.915979,1.235114,1.212714
2,60,1.737528,5.966497,1.822403,1.767552


In [16]:
print(comp.to_string(index=False, float_format=lambda x: f"{x:.3f}"))

 Horizon_min  MAE_Naive  MAE_Seasonal  MAE_Ridge  MAE_HGBR
          15      0.755         5.903      0.810     0.805
          30      1.161         5.916      1.235     1.213
          60      1.738         5.966      1.822     1.768
