In [18]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

# --------------------------
# Config
# --------------------------
# Path of the input CSV, read with pd.read_csv (relative path).
DATA_PATH = "../../final_data/data_260125_control.csv"
# Name of the date column; it is parsed with pd.to_datetime after loading.
DATE_COL = "experiment_date"

# Hold-out dates: each one is scored by a model trained only on rows dated
# strictly before it.
TARGET_WEEKS = [pd.Timestamp("2025-12-15"), pd.Timestamp("2025-12-22")]

# Regression targets; models are fitted separately for each of them.
Y_COLS = ["nonrepeat_cnt_per_user", "trip_cnt_per_user"]

# Columns cast to pandas "category" dtype and passed to LightGBM as
# categorical features.
CAT_COLS = ["ops_type_merged", "city_group"]

# Numeric model inputs. Entries absent from the CSV are reported and dropped;
# the order of this list fixes the order of the numeric columns in X_COLS.
# NOTE(review): the "_lag2" / "_roll4" suffixes presumably denote lagged and
# rolling-window features built upstream -- confirm against the data pipeline.
NUM_FEATURES = [
    "avg_rainy_day",
    "avg_rainy_weekday",
    "avg_rainy_weekend",
    "mgm_day",
    "nonrepeat_cnt_per_user_lag2",
    "nonrepeat_cnt_per_user_roll4",
    "trip_cnt_per_user_lag2",
    "trip_cnt_per_user_roll4",
    "weekday_nonrepeat_cnt_per_user_lag2",
    "weekday_nonrepeat_cnt_per_user_roll4",
    "weekday_trip_cnt_per_user_lag2",
    "weekday_trip_cnt_per_user_roll4",
    "weekday_match_rate_lag2",
    "weekday_match_rate_roll4",
    "weekend_nonrepeat_cnt_per_user_lag2",
    "weekend_nonrepeat_cnt_per_user_roll4",
    "weekend_trip_cnt_per_user_lag2",
    "weekend_trip_cnt_per_user_roll4",
    "weekend_match_rate_lag2",
    "weekend_match_rate_roll4",
    "has_national_holiday",
    "coupon_BD_per_user_log1p_lag2",
    "coupon_BD_per_user_log1p_roll4",
    "coupon_CDP_per_user_log1p_lag2",
    "coupon_CDP_per_user_log1p_roll4",
    "coupon_folk_per_user_log1p_lag2",
    "coupon_folk_per_user_log1p_roll4",
    "coupon_growth_other_per_user_log1p_lag2",
    "coupon_growth_other_per_user_log1p_roll4",
    "coupon_MGM_per_user_log1p_lag2",
    "coupon_MGM_per_user_log1p_roll4",
    "coupon_MKT_per_user_log1p_lag2",
    "coupon_MKT_per_user_log1p_roll4",
    "coupon_register_per_user_log1p_lag2",
    "coupon_register_per_user_log1p_roll4",
    "coupon_daily_per_user_log1p_lag2",
    "coupon_daily_per_user_log1p_roll4",
    "delta_trip_per_user",
    "delta_nonrepeat_per_user",
]

# --------------------------
# Load
# --------------------------
# Read the raw table and parse the date column; values that cannot be parsed
# become NaT because of errors="coerce".
df = pd.read_csv(DATA_PATH)
df[DATE_COL] = pd.to_datetime(df[DATE_COL], errors="coerce")

# Rows without a usable date are dropped, but report how many instead of
# losing them silently.
_bad_date_mask = df[DATE_COL].isna()
_n_bad_dates = int(_bad_date_mask.sum())
if _n_bad_dates:
    print(f"[WARN] Dropping {_n_bad_dates:,} rows with missing/unparseable {DATE_COL}")

# Keep only rows up to the latest target week; later rows are never used.
df = df[~_bad_date_mask & (df[DATE_COL] <= max(TARGET_WEEKS))].copy()

# Fail fast with a clear message if a target column is absent, rather than
# hitting a KeyError in the middle of the evaluation loop.
missing_targets = [c for c in Y_COLS if c not in df.columns]
if missing_targets:
    raise KeyError(f"Target columns missing from {DATA_PATH}: {missing_targets}")

# Categorical columns that actually exist in the data, cast to "category".
CAT_IN_X = [c for c in CAT_COLS if c in df.columns]
for c in CAT_IN_X:
    df[c] = df[c].astype("category")

# Feature availability check: missing features are reported and then left
# out of the model inputs.
missing_num = [c for c in NUM_FEATURES if c not in df.columns]
missing_cat = [c for c in CAT_COLS if c not in df.columns]
if missing_cat:
    print("[WARN] Missing categorical cols (will be dropped):", missing_cat)
if missing_num:
    print("[WARN] Missing numeric features (will be dropped):", missing_num)

# Model inputs: available categorical columns first, then available numeric
# features, each in its configured order.
X_COLS = CAT_IN_X + [c for c in NUM_FEATURES if c in df.columns]

# --------------------------
# Helpers
# --------------------------
def rmse(y_true, y_pred) -> float:
    """Return the root-mean-squared error between *y_true* and *y_pred*.

    The mean squared error comes from ``mean_squared_error``; its square root
    is returned as a built-in ``float``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(squared_error))

def mae(y_true, y_pred) -> float:
    """Return the mean absolute error between *y_true* and *y_pred*.

    The value comes from ``mean_absolute_error`` and is converted to a
    built-in ``float``.
    """
    abs_error = mean_absolute_error(y_true, y_pred)
    return float(abs_error)

def fit_predict_one_week(
    df_all: pd.DataFrame,
    target_week: pd.Timestamp,
    y_col: str,
    model_params: "dict | None" = None,
):
    """Fit a LightGBM regressor on past rows and predict one target week.

    Training rows are those dated strictly before ``target_week``; test rows
    are those dated exactly ``target_week``. Rows whose target is missing are
    dropped from both sides.

    Args:
        df_all: Frame holding ``DATE_COL``, ``y_col`` and every column in
            ``X_COLS``.
        target_week: Date to predict.
        y_col: Name of the target column.
        model_params: Optional ``LGBMRegressor`` keyword overrides applied on
            top of the defaults defined in this function. ``None`` (the
            default) keeps the defaults unchanged.

    Returns:
        ``(y_true, y_pred, n_train, n_test)``. ``y_true`` and ``y_pred`` are
        both ``None`` when either the training or the test split is empty.
    """
    # Strict rule: only data before target_week may be used for training.
    dates = df_all[DATE_COL]
    has_target = df_all[y_col].notna()
    # Boolean indexing already returns new frames and nothing below mutates
    # them, so no explicit .copy() is needed.
    train_df = df_all[(dates < target_week) & has_target]
    test_df = df_all[(dates == target_week) & has_target]

    n_train, n_test = len(train_df), len(test_df)
    if n_train == 0 or n_test == 0:
        return None, None, n_train, n_test

    X_train = train_df[X_COLS]
    y_train = train_df[y_col].astype(float)

    X_test = test_df[X_COLS]
    y_test = test_df[y_col].astype(float)

    # Default hyper-parameters; callers may override any of them.
    params = dict(
        objective="regression",
        n_estimators=3600,
        learning_rate=0.05,
        num_leaves=35,
        min_child_samples=20,
        subsample=1.0,
        colsample_bytree=0.7,
        bagging_freq=0,
        reg_alpha=0.4,
        reg_lambda=7.0,
        random_state=42,
        n_jobs=-1,
        verbose=-1,
    )
    if model_params:
        params.update(model_params)

    model = lgb.LGBMRegressor(**params)

    # categorical_feature: column names are sufficient.
    model.fit(
        X_train,
        y_train,
        categorical_feature=[c for c in CAT_IN_X if c in X_train.columns],
    )

    y_pred = model.predict(X_test)
    return y_test.values, y_pred, n_train, n_test

# --------------------------
# Run + Print metrics
# --------------------------
# For every target: fit and score one model per target week, then report the
# metrics pooled over all weeks that were actually scored.
for y_col in Y_COLS:
    print(f"\n==================== {y_col} ====================")
    all_y_true = []
    all_y_pred = []

    for tw in TARGET_WEEKS:
        y_true, y_pred, n_tr, n_te = fit_predict_one_week(df, tw, y_col)
        if y_true is None:
            # Either no training rows or no test rows for this week.
            print(f"[{tw.date()}] SKIP (train_rows={n_tr:,}, test_rows={n_te:,})")
            continue

        r = rmse(y_true, y_pred)
        m = mae(y_true, y_pred)
        print(f"[{tw.date()}] train_rows={n_tr:,} test_rows={n_te:,}  RMSE={r:.6f}  MAE={m:.6f}")

        all_y_true.append(y_true)
        all_y_pred.append(y_pred)

    if all_y_true:
        # Label the summary with the number of weeks really included, so it
        # stays correct when a week is skipped or TARGET_WEEKS changes.
        n_weeks = len(all_y_true)
        weeks_label = f"{n_weeks} week{'' if n_weeks == 1 else 's'}"

        all_y_true = np.concatenate(all_y_true)
        all_y_pred = np.concatenate(all_y_pred)
        print(f"[Overall {weeks_label}] test_rows={len(all_y_true):,}  RMSE={rmse(all_y_true, all_y_pred):.6f}  MAE={mae(all_y_true, all_y_pred):.6f}")



[2025-12-15] train_rows=456 test_rows=24  RMSE=0.026213  MAE=0.017293
[2025-12-22] train_rows=480 test_rows=24  RMSE=0.044824  MAE=0.031992
[Overall 2 weeks] test_rows=48  RMSE=0.036718  MAE=0.024643

[2025-12-15] train_rows=456 test_rows=24  RMSE=0.017717  MAE=0.013171
[2025-12-22] train_rows=480 test_rows=24  RMSE=0.031629  MAE=0.023037
[Overall 2 weeks] test_rows=48  RMSE=0.025635  MAE=0.018104


In [None]:
# ==========================
# Optuna tuning for CONTROL
# - Tune separately for each target in Y_COLS
# - Objective: avg RMSE over 2025-12-15 & 2025-12-22
# - Training constraint: ONLY use data strictly before each target week
# - No saving files, just print best + per-week metrics
# ==========================

import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Baseline LightGBM settings. Every Optuna trial starts from a copy of this
# dict and overrides the entries that are being tuned.
BASE_PARAMS = {
    "objective": "regression",
    "n_estimators": 3000,
    "learning_rate": 0.05,
    "num_leaves": 31,
    "min_child_samples": 80,
    "subsample": 0.9,
    "colsample_bytree": 0.8,
    "bagging_freq": 1,
    "reg_alpha": 0.0,
    "reg_lambda": 2.0,
    "random_state": 42,
    "n_jobs": -1,
    "verbose": -1,
}

def fit_predict_one_week_with_params(df_all: pd.DataFrame, target_week: pd.Timestamp, y_col: str, model_params: dict):
    """Train an ``LGBMRegressor`` with ``model_params`` and predict one week.

    Rows dated strictly before ``target_week`` form the training set and rows
    dated exactly ``target_week`` form the test set; rows with a missing
    ``y_col`` are removed from both.

    Returns:
        ``(y_true, y_pred, n_train, n_test)``, where the first two items are
        ``None`` if either set ends up empty.
    """
    row_dates = df_all[DATE_COL]
    past_rows = df_all.loc[row_dates < target_week]
    week_rows = df_all.loc[row_dates == target_week]

    # Only rows with an observed target can be trained on or scored.
    fit_df = past_rows.loc[past_rows[y_col].notna()].copy()
    eval_df = week_rows.loc[week_rows[y_col].notna()].copy()

    n_fit, n_eval = len(fit_df), len(eval_df)
    if not (n_fit and n_eval):
        return None, None, n_fit, n_eval

    fit_X, eval_X = fit_df[X_COLS], eval_df[X_COLS]
    fit_y = fit_df[y_col].astype(float)
    eval_y = eval_df[y_col].astype(float)

    categorical_names = [name for name in CAT_IN_X if name in fit_X.columns]

    regressor = lgb.LGBMRegressor(**model_params)
    regressor.fit(fit_X, fit_y, categorical_feature=categorical_names)

    predictions = regressor.predict(eval_X)
    return eval_y.to_numpy(), predictions, n_fit, n_eval

def tune_one_target(y_col: str, n_trials: int = 60):
    """Tune LightGBM hyper-parameters for one target with Optuna, then report.

    Each trial copies ``BASE_PARAMS``, overrides the tuned entries, fits one
    model per entry of ``TARGET_WEEKS`` through
    ``fit_predict_one_week_with_params`` and is scored by the mean of the
    per-week RMSEs. After the search, the best parameters are re-fitted to
    print per-week and pooled RMSE/MAE. Nothing is saved to disk.

    Args:
        y_col: Target column to tune for.
        n_trials: Number of Optuna trials to run.

    Returns:
        The finished ``optuna.Study``.
    """
    n_target_weeks = len(TARGET_WEEKS)
    target_weeks_label = f"{n_target_weeks} week{'' if n_target_weeks == 1 else 's'}"

    def objective(trial: optuna.Trial) -> float:
        params = dict(BASE_PARAMS)

        # Search space around the baseline. Keep the suggest_* calls in this
        # order: with a seeded sampler, reordering them may change the values
        # drawn and therefore the study's results.
        params["learning_rate"] = trial.suggest_float("learning_rate", 0.02, 0.08, log=True)
        params["n_estimators"] = trial.suggest_int("n_estimators", 1200, 4500, step=300)

        params["num_leaves"] = trial.suggest_int("num_leaves", 15, 127, step=4)
        params["min_child_samples"] = trial.suggest_int("min_child_samples", 20, 200, step=10)

        params["subsample"] = trial.suggest_float("subsample", 0.65, 1.0, step=0.05)
        params["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.6, 1.0, step=0.05)

        # bagging_freq: small set of values; BASE_PARAMS uses 1.
        params["bagging_freq"] = trial.suggest_categorical("bagging_freq", [0, 1, 2, 5])

        # Regularisation strengths.
        params["reg_alpha"] = trial.suggest_float("reg_alpha", 0.0, 3.0)
        params["reg_lambda"] = trial.suggest_float("reg_lambda", 0.0, 8.0)

        rmses = []
        for i, tw in enumerate(TARGET_WEEKS, start=1):
            y_true, y_pred, _, _ = fit_predict_one_week_with_params(df, tw, y_col, params)
            # The helper returns None whenever the train or test split is
            # empty, so a separate zero-length check is not needed.
            if y_true is None:
                # Week cannot be scored: give the trial a very bad value.
                return 1e9
            rmses.append(rmse(y_true, y_pred))

            # Report the running mean so the pruner can stop weak trials.
            # NOTE(review): at the last step the mean is already final, so a
            # prune there discards a fully evaluated trial -- consider
            # skipping the check on the final week.
            trial.report(float(np.mean(rmses)), step=i)
            if trial.should_prune():
                raise optuna.TrialPruned()

        return float(np.mean(rmses))

    sampler = optuna.samplers.TPESampler(seed=42)
    pruner = optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=0)
    study = optuna.create_study(direction="minimize", sampler=sampler, pruner=pruner)

    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    print(f"\n===== Optuna Best ({y_col}) =====")
    print(f"Best RMSE (avg over {target_weeks_label}):", study.best_value)
    print("Best params:", study.best_params)

    # study.best_params holds only the tuned entries; layer them on the base.
    best_params = dict(BASE_PARAMS)
    best_params.update(study.best_params)

    # ---- evaluate best params with RMSE + MAE per-week + overall
    print(f"\n==================== {y_col} (BEST params) ====================")
    all_y_true, all_y_pred = [], []
    for tw in TARGET_WEEKS:
        y_true, y_pred, n_tr, n_te = fit_predict_one_week_with_params(df, tw, y_col, best_params)
        if y_true is None:
            print(f"[{tw.date()}] SKIP (train_rows={n_tr:,}, test_rows={n_te:,})")
            continue
        print(f"[{tw.date()}] train_rows={n_tr:,} test_rows={n_te:,} RMSE={rmse(y_true, y_pred):.6f} MAE={mae(y_true, y_pred):.6f}")
        all_y_true.append(y_true)
        all_y_pred.append(y_pred)

    if all_y_true:
        # Label the summary with the number of weeks actually scored.
        n_scored = len(all_y_true)
        scored_label = f"{n_scored} week{'' if n_scored == 1 else 's'}"

        all_y_true = np.concatenate(all_y_true)
        all_y_pred = np.concatenate(all_y_pred)
        print(f"[Overall {scored_label}] test_rows={len(all_y_true):,} RMSE={rmse(all_y_true, all_y_pred):.6f} MAE={mae(all_y_true, all_y_pred):.6f}")

    return study


# --------------------------
# Tune each target in its own Optuna study (60 trials apiece)
# --------------------------
study_trip = tune_one_target(y_col="trip_cnt_per_user", n_trials=60)
study_nonrepeat = tune_one_target(y_col="nonrepeat_cnt_per_user", n_trials=60)


  0%|          | 0/60 [00:00<?, ?it/s]


===== Optuna Best (trip_cnt_per_user) =====
Best RMSE (avg over 2 weeks): 0.0246632502224929
Best params: {'learning_rate': 0.06737402776597771, 'n_estimators': 3600, 'num_leaves': 35, 'min_child_samples': 20, 'subsample': 1.0, 'colsample_bytree': 0.7, 'bagging_freq': 0, 'reg_alpha': 0.3627746654415558, 'reg_lambda': 7.1919990636928235}

[2025-12-15] train_rows=456 test_rows=24 RMSE=0.018084 MAE=0.012919
[2025-12-22] train_rows=480 test_rows=24 RMSE=0.031242 MAE=0.022657
[Overall 2 weeks] test_rows=48 RMSE=0.025526 MAE=0.017788


  0%|          | 0/60 [00:00<?, ?it/s]


===== Optuna Best (nonrepeat_cnt_per_user) =====
Best RMSE (avg over 2 weeks): 0.034817604824950235
Best params: {'learning_rate': 0.035706588601484346, 'n_estimators': 1800, 'num_leaves': 55, 'min_child_samples': 60, 'subsample': 0.75, 'colsample_bytree': 0.65, 'bagging_freq': 2, 'reg_alpha': 1.0707585889530296, 'reg_lambda': 6.881170955936629}

[2025-12-15] train_rows=456 test_rows=24 RMSE=0.024273 MAE=0.015087
[2025-12-22] train_rows=480 test_rows=24 RMSE=0.045362 MAE=0.032614
[Overall 2 weeks] test_rows=48 RMSE=0.036379 MAE=0.023850
