In [5]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge

import lightgbm as lgb

# =========================
# Config
# =========================
# Input CSVs, one per experiment arm.
CONTROL_PATH = "../final_data/data_260125_control.csv"
RANDOM_PATH = "../final_data/data_260125_random.csv"

# Columns that are forecast; every target gets its own model fit.
TARGETS = ["trip_cnt_per_user", "nonrepeat_cnt_per_user"]
# Weeks held out for evaluation.
TEST_WEEKS = [pd.Timestamp(day) for day in ("2025-12-15", "2025-12-22")]

# ops_type_merged value -> coupon values kept when filtering the random arm.
OPS_TREAT_RULES = {
    "14天在其他尖峰預估車資": [15, 20],
    "14天在晚尖峰預估車資": [15, 20],
    "90天在尖峰預估車資": [20, 30],
    "喚回-其他": [20, 30],
    "喚回-高優惠敏感": [20, 30],
    "既有regular鞏固": [15, 20],
    "養成Regular-其他": [20, 30],
    "養成Regular-高優惠敏感": [20, 30],
}
# city_group values kept when filtering the random arm.
CITY_LIST = ["中區", "北區", "南區"]

# True:  a value such as 15 or 20 also covers the 15x2 / 20x2 treatments.
# False: only the single-coupon treatments count (one 15 coupon, one 20
#        coupon, ...).
INCLUDE_X2 = False

# =========================
# Helpers
# =========================
def rmse(y_true, y_pred, w=None):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    ``w`` optionally supplies per-sample weights; when it is ``None`` the
    error is computed without weights. The result is a plain ``float``.
    """
    # Only forward sample_weight when weights were actually given.
    weight_kwargs = {} if w is None else {"sample_weight": w}
    squared_error = mean_squared_error(y_true, y_pred, **weight_kwargs)
    return float(np.sqrt(squared_error))

def ensure_datetime(df, col="experiment_date"):
    """Return a copy of ``df`` with column ``col`` parsed by ``pd.to_datetime``.

    The frame passed in is not modified.
    """
    parsed = pd.to_datetime(df[col])
    result = df.copy()
    result[col] = parsed
    return result

def build_random_filter():
    """Build the row filter applied to the random-arm test set.

    Returns:
        A callable that takes a DataFrame and returns a boolean Series on the
        same index. A row is kept when its ``city_group`` is in ``CITY_LIST``,
        its ``ops_type_merged`` is a key of ``OPS_TREAT_RULES`` and its
        ``treatment`` matches one of the coupon values listed for that ops
        type. ``INCLUDE_X2`` is read on every call and switches between exact
        and prefix matching. The callable raises ``ValueError`` when one of
        the three columns is missing.
    """
    def treatment_match(treat_str, v: int) -> bool:
        """Tell whether the treatment label ``treat_str`` denotes value ``v``."""
        if pd.isna(treat_str):
            return False
        s = str(treat_str)
        if not INCLUDE_X2:
            return s == f"{v}元1張"
        # Prefix match so that "15..." covers both 15元1張 and 15x2元1張. The
        # number has to end right after the prefix, otherwise a label such as
        # "150元1張" would wrongly count as a 15 coupon.
        prefix = str(v)
        next_char = s[len(prefix):len(prefix) + 1]
        return s.startswith(prefix) and not next_char.isdigit()

    def row_ok(city, ops, tr) -> bool:
        """Apply the city, ops-type and treatment rules to one row."""
        if (city not in CITY_LIST) or (ops not in OPS_TREAT_RULES):
            return False
        return any(treatment_match(tr, v) for v in OPS_TREAT_RULES[ops])

    def f(df_test: pd.DataFrame) -> pd.Series:
        for c in ["city_group", "ops_type_merged", "treatment"]:
            if c not in df_test.columns:
                raise ValueError(f"random test set missing required column: {c}")

        # Walk the three columns in lockstep instead of DataFrame.apply(axis=1):
        # no per-row Series is built, and an empty frame still yields an empty
        # boolean Series rather than an empty DataFrame.
        flags = [
            row_ok(city, ops, tr)
            for city, ops, tr in zip(
                df_test["city_group"],
                df_test["ops_type_merged"],
                df_test["treatment"],
            )
        ]
        return pd.Series(flags, index=df_test.index, dtype=bool)

    return f

# =========================
# Model setup
# =========================
# Columns handled as categorical by the model builders.
CAT_COLS = ["treatment", "ops_type_merged", "city_group"]

# Remaining model inputs, handled as numeric.
NUM_COLS = [
    "avg_rainy_day",
    "avg_rainy_weekday",
    "avg_rainy_weekend",
    "mgm_day",
    "trip_cnt_per_user_lag2",
    "trip_cnt_per_user_roll4",
    "nonrepeat_cnt_per_user_lag2",
    "nonrepeat_cnt_per_user_roll4",
]

# Full model input list: categorical columns first, then the numeric ones.
FEATURE_COLS = CAT_COLS + NUM_COLS

def make_ridge_pipeline(cat_cols, num_cols):
    """Build an unfitted preprocessing + Ridge regression pipeline.

    Args:
        cat_cols: Columns imputed with their most frequent value and then
            one-hot encoded; categories unseen during fit are ignored.
        num_cols: Columns imputed with their median, with missing-value
            indicator columns appended.

    Returns:
        A ``Pipeline`` with steps ``"pre"`` (the column transformer, which
        drops every column not listed above) and ``"model"`` (``Ridge`` with
        ``alpha=1.0``).
    """
    categorical_steps = Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ])
    numeric_steps = Pipeline([
        ("imp", SimpleImputer(strategy="median", add_indicator=True)),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", categorical_steps, cat_cols),
            ("num", numeric_steps, num_cols),
        ],
        remainder="drop",
    )
    regressor = Ridge(alpha=1.0, random_state=42)

    return Pipeline([("pre", preprocessor), ("model", regressor)])

def make_lgbm():
    """Return a new, unfitted ``LGBMRegressor`` with this notebook's fixed settings."""
    params = {
        "objective": "regression",
        # Boosting schedule and tree shape.
        "learning_rate": 0.06,
        "n_estimators": 2500,
        "num_leaves": 63,
        "min_data_in_leaf": 30,
        # Feature / row subsampling and L2 regularisation.
        "feature_fraction": 0.9,
        "bagging_fraction": 0.9,
        "bagging_freq": 1,
        "reg_lambda": 1.0,
        # Fixed seed; use every available core.
        "random_state": 42,
        "n_jobs": -1,
    }
    return lgb.LGBMRegressor(**params)

# =========================
# Backtest (aggregate over 2 weeks)
# =========================
def _score_predictions(y_true, weights, preds_by_model):
    """Score every model's predictions, unweighted first and then weighted.

    Returns a dict holding ``<model>_rmse`` for each model followed by
    ``<model>_wrmse`` for each model, in the iteration order of
    ``preds_by_model``.
    """
    scores = {
        f"{model}_rmse": rmse(y_true, pred)
        for model, pred in preds_by_model.items()
    }
    scores.update(
        (f"{model}_wrmse", rmse(y_true, pred, w=weights))
        for model, pred in preds_by_model.items()
    )
    return scores


def run_backtest_aggregate(df, dataset_name, group_filter_fn=None):
    """Backtest the roll4 baseline, Ridge and LightGBM over ``TEST_WEEKS``.

    For every test week the models are fitted on all rows dated strictly
    before that week, using ``user_cnt`` as sample weight, and evaluated on
    the rows dated exactly on that week. The predictions of all evaluated
    weeks are then pooled and scored once more per target.

    Args:
        df: Input frame. Must contain ``experiment_date``, ``user_cnt``,
            every column in ``TARGETS`` and one ``<target>_roll4`` baseline
            column per target. Columns of ``FEATURE_COLS`` that are absent
            from ``df`` are left out of the models.
        dataset_name: Label written to the output rows and used in messages.
        group_filter_fn: Optional callable mapping the test frame to a
            boolean mask. It restricts the test rows only; training rows are
            never filtered.

    Returns:
        ``(summary_df, detail_df)``. ``summary_df`` has one row per target
        with metrics over the pooled weeks; ``detail_df`` has one row per
        evaluated week and target. Both keep their columns even when no week
        could be evaluated.

    Raises:
        ValueError: If a required column or a baseline column is missing.
    """
    df = ensure_datetime(df, "experiment_date")

    # Validate every required column up front, before any model is fitted.
    for c in ["experiment_date", "user_cnt"] + TARGETS:
        if c not in df.columns:
            raise ValueError(f"{dataset_name} missing required column: {c}")
    baseline_cols = {target: f"{target}_roll4" for target in TARGETS}
    for base_col in baseline_cols.values():
        if base_col not in df.columns:
            raise ValueError(f"{dataset_name} missing baseline column: {base_col}")

    feat_cols = [c for c in FEATURE_COLS if c in df.columns]
    cat_cols = [c for c in CAT_COLS if c in feat_cols]
    num_cols = [c for c in NUM_COLS if c in feat_cols]

    ridge = make_ridge_pipeline(cat_cols, num_cols)

    # Output schemas, fixed so that empty results still carry their columns.
    model_names = ("baseline_roll4", "ridge", "lgbm")
    metric_cols = (
        [f"{m}_rmse" for m in model_names]
        + [f"{m}_wrmse" for m in model_names]
    )
    detail_cols = [
        "dataset", "week", "target", "n_test_rows", "test_sum_user_cnt",
    ] + metric_cols
    summary_cols = [
        "dataset", "weeks", "target", "n_test_rows_total",
        "test_sum_user_cnt_total",
    ] + metric_cols

    # Per-target accumulators for truth, weights and predictions across weeks.
    pooled = {
        target: {"y": [], "w": [], "preds": {m: [] for m in model_names}}
        for target in TARGETS
    }
    evaluated_weeks = []  # ISO dates of the weeks that were actually scored
    detail_rows = []      # per-week breakdown, kept for debugging

    for week in TEST_WEEKS:
        train = df[df["experiment_date"] < week].copy()
        test = df[df["experiment_date"] == week].copy()

        if test.empty:
            print(f"[WARN] {dataset_name} no rows for test week {week.date()}")
            continue

        if group_filter_fn is not None:
            test = test[group_filter_fn(test)].copy()

        if test.empty:
            print(f"[WARN] {dataset_name} test week {week.date()} filtered to 0 rows.")
            continue

        if train.empty:
            # Nothing to fit on; skip instead of failing inside the estimators.
            print(f"[WARN] {dataset_name} no training rows before test week {week.date()}")
            continue

        week_label = week.date().isoformat()
        evaluated_weeks.append(week_label)

        w_tr = train["user_cnt"].values
        w_te = test["user_cnt"].values

        # LightGBM inputs do not depend on the target, so build them once per
        # week. user_cnt is used as sample weight only, never as a feature.
        X_tr = train[feat_cols].copy()
        X_te = test[feat_cols].copy()
        for c in cat_cols:
            X_tr[c] = X_tr[c].astype("category")
            # Encode the test column with the training categories so both
            # frames share a single code mapping.
            X_te[c] = X_te[c].astype(X_tr[c].dtype)

        # Each target gets its own fit of both models.
        for target in TARGETS:
            y_tr = train[target].values
            y_te = test[target].values

            preds = {"baseline_roll4": test[baseline_cols[target]].values}

            ridge.fit(train[feat_cols], y_tr, model__sample_weight=w_tr)
            preds["ridge"] = ridge.predict(test[feat_cols])

            lgbm = make_lgbm()
            lgbm.fit(
                X_tr, y_tr,
                sample_weight=w_tr,
                categorical_feature=cat_cols,
            )
            preds["lgbm"] = lgbm.predict(X_te)

            bucket = pooled[target]
            bucket["y"].append(y_te)
            bucket["w"].append(w_te)
            for model, pred in preds.items():
                bucket["preds"][model].append(pred)

            detail_rows.append({
                "dataset": dataset_name,
                "week": week_label,
                "target": target,
                "n_test_rows": len(test),
                "test_sum_user_cnt": float(w_te.sum()),
                **_score_predictions(y_te, w_te, preds),
            })

    # Final step: score the pooled predictions of all evaluated weeks.
    weeks_label = " + ".join(evaluated_weeks)
    summary_rows = []
    for target in TARGETS:
        bucket = pooled[target]
        if not bucket["y"]:
            continue

        y = np.concatenate(bucket["y"])
        w = np.concatenate(bucket["w"])
        preds = {
            model: np.concatenate(chunks)
            for model, chunks in bucket["preds"].items()
        }

        summary_rows.append({
            "dataset": dataset_name,
            "weeks": weeks_label,
            "target": target,
            "n_test_rows_total": int(len(y)),
            "test_sum_user_cnt_total": float(w.sum()),
            **_score_predictions(y, w, preds),
        })

    summary_df = pd.DataFrame(summary_rows, columns=summary_cols)
    detail_df = pd.DataFrame(detail_rows, columns=detail_cols)
    return summary_df, detail_df

def main():
    """Load both datasets, backtest each one and print the result tables.

    The control set is evaluated as is; the random set has its test rows
    restricted by the filter from ``build_random_filter``.
    """
    control = pd.read_csv(CONTROL_PATH)
    random = pd.read_csv(RANDOM_PATH)

    random_filter = build_random_filter()

    sum_c, det_c = run_backtest_aggregate(control, "control", group_filter_fn=None)
    sum_r, det_r = run_backtest_aggregate(random, "random", group_filter_fn=random_filter)

    # (heading, frame, sort keys); summaries are printed in the order returned.
    detail_order = ["week", "target"]
    sections = [
        ("\n=== SUMMARY (two weeks aggregated) : Control ===", sum_c, None),
        ("\n=== SUMMARY (two weeks aggregated) : Random (filtered) ===", sum_r, None),
        ("\n=== DETAIL (per week) : Control ===", det_c, detail_order),
        ("\n=== DETAIL (per week) : Random (filtered) ===", det_r, detail_order),
    ]
    for heading, frame, sort_keys in sections:
        print(heading)
        if sort_keys is not None:
            frame = frame.sort_values(sort_keys)
        print(frame.to_string(index=False))

    # Optional CSV export, currently disabled:
    #sum_c.to_csv("bt_control_summary_1215_1222.csv", index=False, encoding="utf-8-sig")
    #sum_r.to_csv("bt_random_filtered_summary_1215_1222.csv", index=False, encoding="utf-8-sig")
    #det_c.to_csv("bt_control_detail_1215_1222.csv", index=False, encoding="utf-8-sig")
    #det_r.to_csv("bt_random_filtered_detail_1215_1222.csv", index=False, encoding="utf-8-sig")
    #print("\nSaved:")
    #print(" - bt_control_summary_1215_1222.csv")
    #print(" - bt_random_filtered_summary_1215_1222.csv")
    #print(" - bt_control_detail_1215_1222.csv")
    #print(" - bt_random_filtered_detail_1215_1222.csv")


if __name__ == "__main__":
    main()



=== SUMMARY (two weeks aggregated) : Control ===
dataset                   weeks                 target  n_test_rows_total  test_sum_user_cnt_total  baseline_roll4_rmse  ridge_rmse  lgbm_rmse  baseline_roll4_wrmse  ridge_wrmse  lgbm_wrmse
control 2025-12-15 + 2025-12-22      trip_cnt_per_user                 48                  59652.0             0.029146    0.026015   0.029606              0.022475     0.021091    0.018927
control 2025-12-15 + 2025-12-22 nonrepeat_cnt_per_user                 48                  59652.0             0.040344    0.037546   0.038322              0.031768     0.031509    0.025254

=== SUMMARY (two weeks aggregated) : Random (filtered) ===
dataset                   weeks                 target  n_test_rows_total  test_sum_user_cnt_total  baseline_roll4_rmse  ridge_rmse  lgbm_rmse  baseline_roll4_wrmse  ridge_wrmse  lgbm_wrmse
 random 2025-12-15 + 2025-12-22      trip_cnt_per_user                 96                  25424.0             0.051032    0.05552