In [7]:
import numpy as np
import pandas as pd

# --- Settings: input file, week to inspect, and the column names used below ---
DATA_PATH = "../../final_data/data_260119_random.csv"
TARGET_WEEK = "2026-01-05"

DATE_COL = "experiment_date"
LOG_COL = "log1p_user_cnt"
GROUP_COLS = ["treatment", "source", "ops_type_merged", "city_group"]

df = pd.read_csv(DATA_PATH)

# Keep only the rows whose date, rendered as text, equals TARGET_WEEK exactly.
wk = df.loc[df[DATE_COL].astype(str).eq(TARGET_WEEK)].copy()

# Undo the log1p transform with expm1, round to whole numbers, and store the
# result as nullable Int64.
wk["user_cnt_recovered"] = np.expm1(wk[LOG_COL]).round().astype("Int64")

# Total recovered users per group (rows with NaN group keys are kept as their
# own groups), ordered for reading.
group_user_cnt = (
    wk.groupby(GROUP_COLS, dropna=False)["user_cnt_recovered"]
    .sum()
    .rename("total_user_cnt")
    .reset_index()
    .sort_values(["ops_type_merged", "treatment", "source", "city_group"])
)

# Print everything at once: lift the row/column display limits before printing.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 0)

print(group_user_cnt)


    treatment source  ops_type_merged city_group  total_user_cnt
0     15x2元1張    隨機組     14天在其他尖峰預估車資         中區             190
1     15x2元1張    隨機組     14天在其他尖峰預估車資         北區             802
2     15x2元1張    隨機組     14天在其他尖峰預估車資         南區             202
24      15元1張    隨機組     14天在其他尖峰預估車資         中區             199
25      15元1張    隨機組     14天在其他尖峰預估車資         北區             833
26      15元1張    隨機組     14天在其他尖峰預估車資         南區             191
48    20x2元1張    隨機組     14天在其他尖峰預估車資         中區             190
49    20x2元1張    隨機組     14天在其他尖峰預估車資         北區             781
50    20x2元1張    隨機組     14天在其他尖峰預估車資         南區             179
72      20元1張    隨機組     14天在其他尖峰預估車資         中區             217
73      20元1張    隨機組     14天在其他尖峰預估車資         北區             751
74      20元1張    隨機組     14天在其他尖峰預估車資         南區             162
96    30x2元1張    隨機組     14天在其他尖峰預估車資         中區             198
97    30x2元1張    隨機組     14天在其他尖峰預估車資         北區             780
98    30x2元1張    隨機組     

In [4]:
import pandas as pd
import numpy as np

# Metrics file broken down by city_group.
PATH = "../../result/random/random_metrics_by_city_group.csv"

# The two targets to report on.
targets = ["trip_cnt_per_user", "nonrepeat_cnt_per_user"]

# Model labels as they appear in the file: candidate baselines and the LGBM model.
baseline_models = ["baseline_lag1", "baseline_roll4"]
lgbm_models = ["lgbm_global"]

# Load the metrics.
df = pd.read_csv(PATH)

# Normalise a misspelled target label (only matters if the data contains it).
df["target"] = df["target"].replace("nonrepeat_ccnt_per_user", "nonrepeat_cnt_per_user")

# Restrict to the targets of interest.
df = df.loc[df["target"].isin(targets)].copy()

# Best baseline per (city_group, target): rows are sorted so the smallest RMSE
# comes first inside each group, then the first entry of each group is taken.
baseline_best = (
    df.loc[df["model"].isin(baseline_models)]
    .sort_values(["city_group", "target", "RMSE"])
    .groupby(["city_group", "target"], as_index=False)
    .first()
    .rename(columns={"model": "best_baseline_model", "RMSE": "best_baseline_rmse"})
    .loc[:, ["city_group", "target", "best_baseline_model", "best_baseline_rmse"]]
)

# LGBM RMSE per (city_group, target).
lgbm = (
    df.loc[df["model"].isin(lgbm_models), ["city_group", "target", "model", "RMSE"]]
    .rename(columns={"model": "lgbm_model", "RMSE": "lgbm_rmse"})
)

# Join both sides and express the LGBM gain relative to the better baseline,
# as a ratio and as a percentage; then order the rows for output.
out = (
    baseline_best.merge(lgbm, on=["city_group", "target"], how="inner")
    .assign(
        improve_ratio=lambda t: (t["best_baseline_rmse"] - t["lgbm_rmse"]) / t["best_baseline_rmse"],
        improve_pct=lambda t: t["improve_ratio"] * 100,
    )
    .sort_values(["target", "city_group"])
    .reset_index(drop=True)
)

print(out)

# Write the table to disk (UTF-8 with BOM, no index column).
out.to_csv("../../result/random/baseline_improvement_by_city_group.csv", index=False, encoding="utf-8-sig")


  city_group                  target best_baseline_model  best_baseline_rmse  \
0         中區  nonrepeat_cnt_per_user      baseline_roll4            0.073022   
1         北區  nonrepeat_cnt_per_user      baseline_roll4            0.071563   
2         南區  nonrepeat_cnt_per_user      baseline_roll4            0.051918   
3         中區       trip_cnt_per_user      baseline_roll4            0.056941   
4         北區       trip_cnt_per_user      baseline_roll4            0.035799   
5         南區       trip_cnt_per_user      baseline_roll4            0.040872   

    lgbm_model  lgbm_rmse  improve_ratio  improve_pct  
0  lgbm_global   0.058666       0.196607    19.660661  
1  lgbm_global   0.027540       0.615167    61.516724  
2  lgbm_global   0.044562       0.141679    14.167875  
3  lgbm_global   0.045234       0.205586    20.558639  
4  lgbm_global   0.021864       0.389268    38.926798  
5  lgbm_global   0.038695       0.053268     5.326814  


In [6]:
import os
import numpy as np
import pandas as pd

# =========================
# Config
# =========================
# Metrics files to process. Each one is expected to carry its grouping column
# first (city_group / ops_type_merged / treatment), plus "target", "model"
# and "RMSE".
INPUT_FILES = [
    "../../result/random/random_metrics_by_city_group.csv",
    "../../result/random/random_metrics_by_ops_type_merged.csv",
    "../../result/random/random_metrics_by_treatment.csv",
]

OUT_DIR = "../../result/random"   # directory the summary CSVs are written to
os.makedirs(OUT_DIR, exist_ok=True)

TARGETS = ["trip_cnt_per_user", "nonrepeat_cnt_per_user"]
BASELINE_MODELS = ["baseline_lag1", "baseline_roll4"]
LGBM_MODEL = "lgbm_global"

# Columns every metrics file must provide in addition to its grouping column.
REQUIRED_COLS = ("target", "model", "RMSE")

# Misspelled target label -> canonical label (defensive; hoisted out of the loop).
TARGET_TYPO_FIXES = {"nonrepeat_ccnt_per_user": "nonrepeat_cnt_per_user"}

# =========================
# Main
# =========================
for path in INPUT_FILES:
    df = pd.read_csv(path)

    # The grouping column is taken by position: the first column of each file.
    group_col = df.columns[0]

    # Fail early with a clear message when the layout is not what the positional
    # lookup assumes (e.g. the CSV was saved with its index, which pandas reads
    # back as "Unnamed: 0", or the columns were reordered).
    missing_cols = [col for col in REQUIRED_COLS if col not in df.columns]
    if missing_cols:
        raise ValueError(f"{path}: missing required column(s) {missing_cols}")
    if group_col in REQUIRED_COLS or str(group_col).startswith("Unnamed"):
        raise ValueError(
            f"{path}: first column {group_col!r} does not look like a grouping column"
        )

    # Normalise a possibly misspelled target label, then keep the targets of interest.
    df["target"] = df["target"].replace(TARGET_TYPO_FIXES)

    df = df[df["target"].isin(TARGETS)].copy()

    # Best baseline per (group, target): rows are sorted so the smallest RMSE
    # comes first inside each group, then the first entry of each group is taken.
    baseline_best = (
        df[df["model"].isin(BASELINE_MODELS)]
          .sort_values([group_col, "target", "RMSE"], ascending=[True, True, True])
          .groupby([group_col, "target"], as_index=False)
          .first()
          .rename(columns={"model": "best_baseline_model", "RMSE": "best_baseline_rmse"})
          [[group_col, "target", "best_baseline_model", "best_baseline_rmse"]]
    )

    # LGBM RMSE per (group, target).
    lgbm = (
        df[df["model"] == LGBM_MODEL]
          .rename(columns={"model": "lgbm_model", "RMSE": "lgbm_rmse"})
          [[group_col, "target", "lgbm_model", "lgbm_rmse"]]
    )

    # validate="one_to_one" raises if either side has more than one row per
    # (group, target) instead of silently duplicating rows in the summary.
    out = baseline_best.merge(
        lgbm, on=[group_col, "target"], how="inner", validate="one_to_one"
    )

    # Relative RMSE reduction of LGBM versus the better baseline (ratio and percent).
    out["improve_ratio"] = (out["best_baseline_rmse"] - out["lgbm_rmse"]) / out["best_baseline_rmse"]
    out["improve_pct"] = out["improve_ratio"] * 100

    out = out.sort_values(["target", group_col]).reset_index(drop=True)

    # Save one summary CSV per input file, named after its grouping column.
    out_name = f"baseline_improvement_by_{group_col}.csv"
    out_path = os.path.join(OUT_DIR, out_name)
    out.to_csv(out_path, index=False, encoding="utf-8-sig")

    print(f"[Saved] {out_path}  | rows={len(out)}")


[Saved] ../../result/random\baseline_improvement_by_city_group.csv  | rows=6
[Saved] ../../result/random\baseline_improvement_by_ops_type_merged.csv  | rows=16
[Saved] ../../result/random\baseline_improvement_by_treatment.csv  | rows=16
