In [1]:
# === 1) Setup & load ===
import numpy as np, pandas as pd
from pathlib import Path
from sklearn.metrics import mean_absolute_error

# FICHIER À ADAPTER SI BESOIN
DATA = Path("data/raw/velib_timeseries_5min.csv")

FREQ_MIN = 5
HORIZONS = [15, 30, 60]      # minutes
SPLIT_TRAINTEST = 0.70       # 70% / 30% (split temporel)
SPLIT_TRAINVAL  = 0.85       # 85% / 15% (dans TRAIN pour early stopping)

# Chargement
usecols = ["ts","station_id","bikes_available","capacity"]
dtypes  = {"station_id":"category","bikes_available":"float32","capacity":"float32"}
df = pd.read_csv(DATA, usecols=usecols, dtype=dtypes, parse_dates=["ts"])\
       .sort_values(["station_id","ts"]).reset_index(drop=True)

# Capacité robuste par station (pour ratio & clip)
df["capacity"] = (df.groupby("station_id")["capacity"]
                    .transform(lambda s: s.ffill().bfill().fillna(s.max()))
                    .astype("float32"))
df = df[df["capacity"] > 0].copy()

# Ratio d'occupation
df["occ"] = (df["bikes_available"] / df["capacity"]).clip(0, 1).astype("float32")

print("Rows:", len(df), "| Stations:", df["station_id"].nunique())
df.head(3)


  df["capacity"] = (df.groupby("station_id")["capacity"]


Rows: 3304414 | Stations: 1468


Unnamed: 0,ts,station_id,bikes_available,capacity,occ
0,2025-08-21 17:45:00+00:00,1002059045,23.0,27.0,0.851852
1,2025-08-21 17:50:00+00:00,1002059045,25.0,27.0,0.925926
2,2025-08-21 17:55:00+00:00,1002059045,23.0,27.0,0.851852


In [2]:
# === 2) Baseline Naïve (bikes) ===
tmp = df.copy()
for h in HORIZONS:
    sh = h // FREQ_MIN
    tmp[f"occ_{h}"] = tmp.groupby("station_id")["occ"].shift(-sh).astype("float32")

cut = tmp["ts"].quantile(SPLIT_TRAINTEST)
test_naive = tmp[tmp["ts"] > cut].copy()

mae_naive_bikes = {}
for h in HORIZONS:
    y = test_naive[f"occ_{h}"].dropna()
    p_occ = test_naive.loc[y.index, "occ"]
    cap   = test_naive.loc[y.index, "capacity"]
    mae_naive_bikes[h] = float(mean_absolute_error(y*cap, p_occ*cap))

print("MAE Naïve (bikes):", {h: round(v,3) for h,v in mae_naive_bikes.items()})


  tmp[f"occ_{h}"] = tmp.groupby("station_id")["occ"].shift(-sh).astype("float32")
  tmp[f"occ_{h}"] = tmp.groupby("station_id")["occ"].shift(-sh).astype("float32")
  tmp[f"occ_{h}"] = tmp.groupby("station_id")["occ"].shift(-sh).astype("float32")


MAE Naïve (bikes): {15: 0.75, 30: 1.156, 60: 1.739}


In [3]:
# === 3) Features & targets (future occupancy) ===
feat = df.copy().sort_values(["station_id","ts"]).reset_index(drop=True)

# Calendrier au temps t
feat["hour"] = feat["ts"].dt.hour.astype("uint8")
feat["dow"]  = feat["ts"].dt.dayofweek.astype("uint8")
feat["is_weekend"] = (feat["dow"]>=5).astype("uint8")
feat["hour_sin"] = np.sin(2*np.pi*feat["hour"]/24).astype("float32")
feat["hour_cos"] = np.cos(2*np.pi*feat["hour"]/24).astype("float32")

# Lags d'occupation (5,10,15,30,60)
def shift(col, k): return feat.groupby("station_id")[col].shift(k)
for k in [1,2,3,6,12]:
    feat[f"occ_lag_{k*FREQ_MIN}"] = shift("occ", k).astype("float32")

# Rolling moyens (t-1): 1h, 2h
feat["occ_roll_60"]  = shift("occ", 1).rolling(12).mean().reset_index(level=0, drop=True).astype("float32")
feat["occ_roll_120"] = shift("occ", 1).rolling(24).mean().reset_index(level=0, drop=True).astype("float32")

# Deltas rapides
feat["occ_delta_5"]  = (feat["occ"] - shift("occ", 1)).astype("float32")
feat["occ_delta_15"] = (feat["occ"] - shift("occ", 3)).astype("float32")

# Cibles: occupation future
for h in HORIZONS:
    sh = h // FREQ_MIN
    feat[f"occ_{h}"] = feat.groupby("station_id")["occ"].shift(-sh).astype("float32")

# Split temporel Train/Test
cut = feat["ts"].quantile(SPLIT_TRAINTEST)
train = feat[feat["ts"] <= cut].copy()
test  = feat[feat["ts"] >  cut].copy()

# Encodages station calculés sur TRAIN (en ratio)
sta_mean = (train.groupby("station_id")["occ"].mean().rename("sta_mean_occ")).reset_index()
sta_hdh  = (train.assign(hour=train["ts"].dt.hour.astype("uint8"),
                         dow=train["ts"].dt.dayofweek.astype("uint8"))
                 .groupby(["station_id","dow","hour"])["occ"].median()
                 .rename("sta_hdh_occ")).reset_index()

def add_station_encodings(frame):
    out = frame.merge(sta_mean, on="station_id", how="left")
    out = out.merge(sta_hdh, left_on=["station_id","dow","hour"],
                           right_on=["station_id","dow","hour"], how="left")
    out["sta_hdh_occ"] = out["sta_hdh_occ"].fillna(out["sta_mean_occ"])
    return out

train = add_station_encodings(train).sort_values(["station_id","ts"]).reset_index(drop=True)
test  = add_station_encodings(test ).sort_values(["station_id","ts"]).reset_index(drop=True)

# Liste de features (pas de doublon)
feat_cols = [
    "dow","is_weekend","hour_sin","hour_cos",
    "occ_lag_5","occ_lag_10","occ_lag_15","occ_lag_30","occ_lag_60",
    "occ_roll_60","occ_roll_120",
    "occ_delta_5","occ_delta_15",
    "sta_mean_occ","sta_hdh_occ",
    "capacity"  # pour clip / reconstruction
]

# Drop NaN uniquement sur ce qui est nécessaire
needed = feat_cols + [f"occ_{h}" for h in HORIZONS]
train = train.dropna(subset=needed)
test  = test.dropna(subset=needed)

# Sanity
assert set(feat_cols).issubset(train.columns)
assert set(feat_cols).issubset(test.columns)
len(feat_cols), train[feat_cols].head(3)


  def shift(col, k): return feat.groupby("station_id")[col].shift(k)
  def shift(col, k): return feat.groupby("station_id")[col].shift(k)
  def shift(col, k): return feat.groupby("station_id")[col].shift(k)
  def shift(col, k): return feat.groupby("station_id")[col].shift(k)
  def shift(col, k): return feat.groupby("station_id")[col].shift(k)
  def shift(col, k): return feat.groupby("station_id")[col].shift(k)
  def shift(col, k): return feat.groupby("station_id")[col].shift(k)
  def shift(col, k): return feat.groupby("station_id")[col].shift(k)
  def shift(col, k): return feat.groupby("station_id")[col].shift(k)
  feat[f"occ_{h}"] = feat.groupby("station_id")["occ"].shift(-sh).astype("float32")
  feat[f"occ_{h}"] = feat.groupby("station_id")["occ"].shift(-sh).astype("float32")
  feat[f"occ_{h}"] = feat.groupby("station_id")["occ"].shift(-sh).astype("float32")
  sta_mean = (train.groupby("station_id")["occ"].mean().rename("sta_mean_occ")).reset_index()
  .groupby(["station_id","dow","h

(16,
     dow  is_weekend  hour_sin  hour_cos  occ_lag_5  occ_lag_10  occ_lag_15  \
 24    3           0 -0.965926  0.258819   0.703704    0.740741    0.703704   
 25    3           0 -0.965926  0.258819   0.777778    0.703704    0.740741   
 26    3           0 -0.965926  0.258819   0.740741    0.777778    0.703704   
 
     occ_lag_30  occ_lag_60  occ_roll_60  occ_roll_120  occ_delta_5  \
 24    0.666667    0.703704     0.685185      0.728395     0.074074   
 25    0.592593    0.703704     0.691358      0.725309    -0.037037   
 26    0.666667    0.703704     0.694444      0.717593     0.037037   
 
     occ_delta_15  sta_mean_occ  sta_hdh_occ  capacity  
 24      0.074074      0.574086     0.648148      27.0  
 25      0.000000      0.574086     0.648148      27.0  
 26      0.074074      0.574086     0.648148      27.0  )

In [7]:
# === PATCH: ajouter le niveau courant comme feature forte ===
train["occ_now"] = train["occ"].astype("float32")
test["occ_now"]  = test["occ"].astype("float32")

# mettre à jour la liste des features (ajoute 'occ_now', conserve 'capacity' pour le clip)
feat_cols = [
    "dow","is_weekend","hour_sin","hour_cos",
    "occ_now",                         # <-- ANCRE le modèle sur le niveau courant
    "occ_lag_5","occ_lag_10","occ_lag_15","occ_lag_30","occ_lag_60",
    "occ_roll_60","occ_roll_120",
    "occ_delta_5","occ_delta_15",
    "sta_mean_occ","sta_hdh_occ",
    "capacity"
]

# (sécurité) enlever toute ligne avec NaN sur ces features
train = train.dropna(subset=feat_cols + [f"occ_{h}" for h in HORIZONS])
test  = test.dropna(subset=feat_cols + [f"occ_{h}" for h in HORIZONS])

assert set(feat_cols).issubset(train.columns) and set(feat_cols).issubset(test.columns)
print("len(feat_cols) =", len(feat_cols))


len(feat_cols) = 17


In [9]:
# === 4) LightGBM (occupation) ===
# Si LightGBM manque : décommente la ligne ci-dessous
# !pip install lightgbm -q

import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

def train_lgbm_occ(train_df, test_df, horizon, num_threads=4):
    y_future = f"occ_{horizon}"
    tr = train_df.sort_values(["station_id","ts"]).reset_index(drop=True)
    te = test_df .sort_values(["station_id","ts"]).reset_index(drop=True)

    # split train/val temporel
    cut_tr = tr["ts"].quantile(SPLIT_TRAINVAL)
    tr_tr  = tr[tr["ts"] <= cut_tr]
    tr_val = tr[tr["ts"] >  cut_tr]

    Xtr, ytr = tr_tr[feat_cols], tr_tr[y_future]
    Xval, yval = tr_val[feat_cols], tr_val[y_future]
    Xte,  yte  = te[feat_cols],  te[y_future]

    params = dict(objective="regression", metric="l1",
                  learning_rate=0.05, num_leaves=63, max_depth=-1,
                  feature_fraction=0.9, bagging_fraction=0.8, bagging_freq=1,
                  seed=42, verbosity=-1, num_threads=num_threads)

    dtrain = lgb.Dataset(Xtr, label=ytr)
    dval   = lgb.Dataset(Xval, label=yval, reference=dtrain)

    model = lgb.train(params, dtrain, num_boost_round=2000,
                      valid_sets=[dtrain, dval], valid_names=["train","val"],
                      callbacks=[lgb.early_stopping(stopping_rounds=80)])

    # prédiction ratio -> vélos + clip
    occ_hat = np.clip(model.predict(Xte, num_iteration=model.best_iteration), 0, 1)
    y_hat_bikes  = (occ_hat * Xte["capacity"].to_numpy())
    y_true_bikes = (yte     * Xte["capacity"].to_numpy())

    mae = mean_absolute_error(y_true_bikes, y_hat_bikes)
    params.update(dict(min_data_in_leaf=64, lambda_l1=1.0, lambda_l2=1.0, num_threads=4))
    return mae, model, int(model.best_iteration or 0)

mae_lgbm, iters, models = {}, {}, {}
for h in HORIZONS:
    mae, mdl, it = train_lgbm_occ(train, test, h, num_threads=4)
    mae_lgbm[h] = mae; iters[h] = it; models[h] = mdl

print("Best iters:", iters)
print("MAE LGBM (bikes):", {h: round(v,3) for h,v in mae_lgbm.items()})


Training until validation scores don't improve for 80 rounds
Did not meet early stopping. Best iteration is:
[1981]	train's l1: 0.0228366	val's l1: 0.0343757
Training until validation scores don't improve for 80 rounds
Did not meet early stopping. Best iteration is:
[1990]	train's l1: 0.0324185	val's l1: 0.0486582
Training until validation scores don't improve for 80 rounds
Did not meet early stopping. Best iteration is:
[2000]	train's l1: 0.0493486	val's l1: 0.0717778
Best iters: {15: 1981, 30: 1990, 60: 2000}
MAE LGBM (bikes): {15: 4.174, 30: 6.079, 60: 6.318}


In [10]:
# === 5) Comparatif + debug ===
import pandas as pd
comp = pd.DataFrame({
    "Horizon_min": HORIZONS,
    "MAE_Naive":   [mae_naive_bikes[h] for h in HORIZONS],
    "MAE_LGBM":    [mae_lgbm[h]       for h in HORIZONS],
})
print(comp.to_string(index=False, float_format=lambda x: f"{x:.3f}"))

# Debug 5 lignes (h=30)
h = 30
y_future = f"occ_{h}"
te_dbg = test.dropna(subset=feat_cols+[y_future]).sort_values(["station_id","ts"]).reset_index(drop=True)
mdl = models[h]
occ_hat = np.clip(mdl.predict(te_dbg[feat_cols], num_iteration=mdl.best_iteration), 0, 1)
y_hat = (occ_hat * te_dbg["capacity"].to_numpy())
y_true = (te_dbg[y_future] * te_dbg["capacity"]).to_numpy()

print("\nSample rows:")
for i in range(5):
    row = te_dbg.iloc[i][["ts","station_id","bikes_available","capacity"]].to_dict()
    print(row, " | y_true:", round(float(y_true[i]),2), " | y_hat:", round(float(y_hat[i]),2))


 Horizon_min  MAE_Naive  MAE_LGBM
          15      0.750     4.174
          30      1.156     6.079
          60      1.739     6.318

Sample rows:
{'ts': Timestamp('2025-08-28 23:15:00+0000', tz='UTC'), 'station_id': '1002059045', 'bikes_available': 5.0, 'capacity': 27.0}  | y_true: 4.0  | y_hat: 9.01
{'ts': Timestamp('2025-08-28 23:20:00+0000', tz='UTC'), 'station_id': '1002059045', 'bikes_available': 4.0, 'capacity': 27.0}  | y_true: 5.0  | y_hat: 8.23
{'ts': Timestamp('2025-08-28 23:25:00+0000', tz='UTC'), 'station_id': '1002059045', 'bikes_available': 5.0, 'capacity': 27.0}  | y_true: 4.0  | y_hat: 9.38
{'ts': Timestamp('2025-08-28 23:30:00+0000', tz='UTC'), 'station_id': '1002059045', 'bikes_available': 4.0, 'capacity': 27.0}  | y_true: 4.0  | y_hat: 8.25
{'ts': Timestamp('2025-08-28 23:35:00+0000', tz='UTC'), 'station_id': '1002059045', 'bikes_available': 4.0, 'capacity': 27.0}  | y_true: 4.0  | y_hat: 8.57


In [11]:
print("=== RÉSUMÉ RAPIDE ===")
print("Rows/Stations:", len(df), df["station_id"].nunique())
print("Capacity NaN:", int(df["capacity"].isna().sum()))
print("occ describe:", df["occ"].describe()[["min","mean","50%","max"]].to_dict())

print("\nMAE Naïve (bikes):", {h: round(mae_naive_bikes[h],3) for h in HORIZONS})

print("\nTrain/Test shapes:", train.shape, test.shape)
print("len(feat_cols):", len(feat_cols))
print("Top NaN features train:", train[feat_cols].isna().mean().sort_values(ascending=False).head(5).to_dict())

print("\nBest iters:", {h: int(iters[h]) for h in iters})
print("MAE LGBM (bikes):", {h: round(mae_lgbm[h],3) for h in HORIZONS})

import pandas as pd
h=30
fi = pd.Series(models[h].feature_importance(), index=feat_cols).sort_values(ascending=False).head(10)
print("\nTop10 importances @+30:", fi.to_dict())

# Debug 3 lignes pour compacité
y_future = f"occ_{h}"
te_dbg = test.dropna(subset=feat_cols+[y_future]).sort_values(["station_id","ts"]).reset_index(drop=True)
occ_hat = np.clip(models[h].predict(te_dbg[feat_cols], num_iteration=models[h].best_iteration), 0, 1)
y_hat = (occ_hat * te_dbg["capacity"].to_numpy())
y_true = (te_dbg[y_future] * te_dbg["capacity"]).to_numpy()
for i in range(3):
    row = te_dbg.iloc[i][["ts","station_id","bikes_available","capacity"]].to_dict()
    print("DBG", i, row, "| y_true:", round(float(y_true[i]),2), "| y_hat:", round(float(y_hat[i]),2))


=== RÉSUMÉ RAPIDE ===
Rows/Stations: 3304414 1468
Capacity NaN: 0
occ describe: {'min': 0.0, 'mean': 0.3883110284805298, '50%': 0.3214285671710968, 'max': 1.0}

MAE Naïve (bikes): {15: 0.75, 30: 1.156, 60: 1.739}

Train/Test shapes: (2278011, 25) (973555, 25)
len(feat_cols): 17
Top NaN features train: {'dow': 0.0, 'occ_lag_60': 0.0, 'sta_hdh_occ': 0.0, 'sta_mean_occ': 0.0, 'occ_delta_15': 0.0}

Best iters: {15: 1981, 30: 1990, 60: 2000}
MAE LGBM (bikes): {15: 4.174, 30: 6.079, 60: 6.318}

Top10 importances @+30: {'sta_hdh_occ': 15159, 'sta_mean_occ': 15100, 'occ_roll_120': 9932, 'capacity': 9531, 'occ_now': 8881, 'occ_lag_60': 7874, 'occ_lag_30': 7405, 'occ_roll_60': 6880, 'hour_sin': 6758, 'hour_cos': 6532}
DBG 0 {'ts': Timestamp('2025-08-28 23:15:00+0000', tz='UTC'), 'station_id': '1002059045', 'bikes_available': 5.0, 'capacity': 27.0} | y_true: 4.0 | y_hat: 9.01
DBG 1 {'ts': Timestamp('2025-08-28 23:20:00+0000', tz='UTC'), 'station_id': '1002059045', 'bikes_available': 4.0, 'capacit

In [12]:
# === A) Ajouter cibles Δ (future - now) et préparer les features ===
# On garde occ_now comme ancre, et on prédit Δ_occ_h = occ(t+h) - occ(t)
# On teste deux variantes: avec et sans encodages station (les encodages peuvent sur-lisser)

USE_STA_ENC = False  # commence par False ; si c'est mieux, on confirmera

# 1) Assurer le tri
train = train.sort_values(["station_id","ts"]).reset_index(drop=True)
test  = test.sort_values(["station_id","ts"]).reset_index(drop=True)

# 2) occ_now (si pas déjà créé)
if "occ_now" not in train.columns:
    train["occ_now"] = train["occ"].astype("float32")
    test["occ_now"]  = test["occ"].astype("float32")

# 3) Cibles Δ sur train et test (shift *dans chaque split* pour alignement propre)
for h in HORIZONS:
    sh = h // FREQ_MIN
    train[f"occ_delta_target_{h}"] = (train.groupby("station_id")["occ"].shift(-sh) - train["occ"]).astype("float32")
    test[f"occ_delta_target_{h}"]  = (test .groupby("station_id")["occ"].shift(-sh) - test ["occ"]).astype("float32")

# 4) Features pour Δ (on part de la même base + occ_now ; encodages station optionnels)
base_feats = [
    "dow","is_weekend","hour_sin","hour_cos",
    "occ_now",                                   # ANCRE
    "occ_lag_5","occ_lag_10","occ_lag_15","occ_lag_30","occ_lag_60",
    "occ_roll_60","occ_roll_120",
    "occ_delta_5","occ_delta_15",
]
sta_feats = ["sta_mean_occ","sta_hdh_occ"] if USE_STA_ENC else []
feat_cols_delta = base_feats + sta_feats + ["capacity"]  # on garde capacity pour le clip, même si la cible est un ratio

# 5) Nettoyage NaN minimal pour entraîner proprement
need_train = feat_cols_delta + [f"occ_delta_target_{h}" for h in HORIZONS]
need_test  = feat_cols_delta + [f"occ_delta_target_{h}" for h in HORIZONS]
train_delta = train.dropna(subset=need_train).copy()
test_delta  = test .dropna(subset=need_test ).copy()

print("USE_STA_ENC =", USE_STA_ENC)
print("train_delta/test_delta:", train_delta.shape, test_delta.shape)


  train[f"occ_delta_target_{h}"] = (train.groupby("station_id")["occ"].shift(-sh) - train["occ"]).astype("float32")
  test[f"occ_delta_target_{h}"]  = (test .groupby("station_id")["occ"].shift(-sh) - test ["occ"]).astype("float32")
  train[f"occ_delta_target_{h}"] = (train.groupby("station_id")["occ"].shift(-sh) - train["occ"]).astype("float32")
  test[f"occ_delta_target_{h}"]  = (test .groupby("station_id")["occ"].shift(-sh) - test ["occ"]).astype("float32")
  train[f"occ_delta_target_{h}"] = (train.groupby("station_id")["occ"].shift(-sh) - train["occ"]).astype("float32")
  test[f"occ_delta_target_{h}"]  = (test .groupby("station_id")["occ"].shift(-sh) - test ["occ"]).astype("float32")


USE_STA_ENC = False
train_delta/test_delta: (2260395, 28) (955951, 28)


In [13]:
# === B) Entraînement LightGBM Δ + reconstruction ===
# !pip install lightgbm -q
import lightgbm as lgb
import numpy as np
from sklearn.metrics import mean_absolute_error

def train_lgbm_delta(train_df, test_df, horizon, num_threads=4):
    y_col = f"occ_delta_target_{horizon}"
    tr = train_df.sort_values(["station_id","ts"]).reset_index(drop=True)
    te = test_df .sort_values(["station_id","ts"]).reset_index(drop=True)

    # Split temporel train/val dans TRAIN
    cut_tr = tr["ts"].quantile(0.85)
    tr_tr  = tr[tr["ts"] <= cut_tr]
    tr_val = tr[tr["ts"] >  cut_tr]

    Xtr, ytr = tr_tr[feat_cols_delta], tr_tr[y_col]
    Xval, yval = tr_val[feat_cols_delta], tr_val[y_col]
    Xte = te[feat_cols_delta]

    params = dict(
        objective="regression_l1",   # L1 direct sur la cible Δ
        metric="l1",
        learning_rate=0.05,
        num_leaves=63,
        max_depth=-1,
        feature_fraction=0.9,
        bagging_fraction=0.8,
        bagging_freq=1,
        min_data_in_leaf=64,
        lambda_l1=1.0, lambda_l2=1.0,
        seed=42, verbosity=-1, num_threads=num_threads
    )
    dtrain = lgb.Dataset(Xtr, label=ytr)
    dval   = lgb.Dataset(Xval, label=yval, reference=dtrain)

    model = lgb.train(params, dtrain, num_boost_round=2000,
                      valid_sets=[dtrain, dval], valid_names=["train","val"],
                      callbacks=[lgb.early_stopping(stopping_rounds=80)])

    # Reconstruction: occ_hat = occ_now + Δ̂ ; puis clip [0,1] et conversion vélos
    delta_pred = model.predict(Xte, num_iteration=model.best_iteration)
    occ_hat = np.clip(te["occ_now"].to_numpy() + delta_pred, 0, 1)
    y_hat_bikes  = occ_hat * te["capacity"].to_numpy()

    # Vérité terrain: occ_future * capacity (utilise la colonne déjà calculée à l'étape 3)
    y_true_bikes = (te[f"occ_{horizon}"] * te["capacity"]).to_numpy()

    mae = mean_absolute_error(y_true_bikes, y_hat_bikes)
    return mae, model, int(model.best_iteration or 0)

mae_lgbm_delta, iters_delta, models_delta = {}, {}, {}
for h in HORIZONS:
    mae, mdl, it = train_lgbm_delta(train_delta, test_delta, h, num_threads=4)
    mae_lgbm_delta[h] = mae; iters_delta[h] = it; models_delta[h] = mdl

print("Best iters (Δ):", iters_delta)
print("MAE LGBM (Δ->bikes):", {h: round(v,3) for h,v in mae_lgbm_delta.items()})


Training until validation scores don't improve for 80 rounds
Did not meet early stopping. Best iteration is:
[1946]	train's l1: 0.0261865	val's l1: 0.0340072
Training until validation scores don't improve for 80 rounds
Early stopping, best iteration is:
[1594]	train's l1: 0.0401459	val's l1: 0.0504111
Training until validation scores don't improve for 80 rounds
Did not meet early stopping. Best iteration is:
[2000]	train's l1: 0.0583256	val's l1: 0.0728631
Best iters (Δ): {15: 1946, 30: 1594, 60: 2000}
MAE LGBM (Δ->bikes): {15: 0.764, 30: 1.169, 60: 1.73}


In [14]:
# === C) Comparatif & Debug @+30 ===
import pandas as pd
comp = pd.DataFrame({
    "Horizon_min": HORIZONS,
    "MAE_Naive":   [mae_naive_bikes[h] for h in HORIZONS],
    "MAE_LGBM_Δ":  [mae_lgbm_delta[h]  for h in HORIZONS],
})
print(comp.to_string(index=False, float_format=lambda x: f"{x:.3f}"))

# Debug 5 lignes @+30
h = 30
te_dbg = test_delta.dropna(subset=feat_cols_delta+[f"occ_{h}", f"occ_delta_target_{h}"])\
                   .sort_values(["station_id","ts"]).reset_index(drop=True)
mdl = models_delta[h]
delta_pred = mdl.predict(te_dbg[feat_cols_delta], num_iteration=mdl.best_iteration)
occ_hat = np.clip(te_dbg["occ_now"].to_numpy() + delta_pred, 0, 1)
y_hat = occ_hat * te_dbg["capacity"].to_numpy()
y_true = te_dbg[f"occ_{h}"].to_numpy() * te_dbg["capacity"].to_numpy()
delta_true = te_dbg[f"occ_delta_target_{h}"].to_numpy()

for i in range(5):
    row = te_dbg.iloc[i][["ts","station_id","bikes_available","capacity"]].to_dict()
    print(row, " | y_true:", round(float(y_true[i]),2),
          " | y_hat:", round(float(y_hat[i]),2),
          " | Δ_true:", round(float(delta_true[i]),3),
          " | Δ_pred:", round(float(delta_pred[i]),3))


 Horizon_min  MAE_Naive  MAE_LGBM_Δ
          15      0.750       0.764
          30      1.156       1.169
          60      1.739       1.730
{'ts': Timestamp('2025-08-28 23:15:00+0000', tz='UTC'), 'station_id': '1002059045', 'bikes_available': 5.0, 'capacity': 27.0}  | y_true: 4.0  | y_hat: 5.0  | Δ_true: -0.037  | Δ_pred: 0.0
{'ts': Timestamp('2025-08-28 23:20:00+0000', tz='UTC'), 'station_id': '1002059045', 'bikes_available': 4.0, 'capacity': 27.0}  | y_true: 5.0  | y_hat: 4.0  | Δ_true: 0.037  | Δ_pred: -0.0
{'ts': Timestamp('2025-08-28 23:25:00+0000', tz='UTC'), 'station_id': '1002059045', 'bikes_available': 5.0, 'capacity': 27.0}  | y_true: 4.0  | y_hat: 5.0  | Δ_true: -0.037  | Δ_pred: -0.0
{'ts': Timestamp('2025-08-28 23:30:00+0000', tz='UTC'), 'station_id': '1002059045', 'bikes_available': 4.0, 'capacity': 27.0}  | y_true: 4.0  | y_hat: 4.0  | Δ_true: 0.0  | Δ_pred: -0.0
{'ts': Timestamp('2025-08-28 23:35:00+0000', tz='UTC'), 'station_id': '1002059045', 'bikes_available': 4.