In [7]:
import numpy as np
import pandas as pd

# --- Configuration -----------------------------------------------------------
DATA_PATH = "../../final_data/data_260119_random.csv"
TARGET_WEEK = "2026-01-05"

GROUP_COLS = ["treatment", "source", "ops_type_merged", "city_group"]
DATE_COL = "experiment_date"
LOG_COL = "log1p_user_cnt"

# --- Load the data and keep only the target week -----------------------------
df = pd.read_csv(DATA_PATH)

# The date column is compared as text against TARGET_WEEK.
in_target_week = df[DATE_COL].astype(str).eq(TARGET_WEEK)
wk = df.loc[in_target_week].copy()

# Undo the log1p transform (assumes LOG_COL stores log1p(user_cnt) -- TODO confirm)
# and round to a nullable integer so missing values stay <NA>.
recovered_cnt = np.expm1(wk[LOG_COL]).round()
wk["user_cnt_recovered"] = recovered_cnt.astype("Int64")

# --- Total recovered users per group -----------------------------------------
# dropna=False keeps groups whose key columns contain missing values.
per_group_total = wk.groupby(GROUP_COLS, dropna=False)["user_cnt_recovered"].sum()
group_user_cnt = per_group_total.reset_index(name="total_user_cnt").sort_values(
    by=["ops_type_merged", "treatment", "source", "city_group"],
    ascending=True,
)

# --- Print everything at once (no row/column truncation) ---------------------
display_options = {
    "display.max_rows": None,
    "display.max_columns": None,
    "display.width": 0,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

print(group_user_cnt)


    treatment source  ops_type_merged city_group  total_user_cnt
0     15x2元1張    隨機組     14天在其他尖峰預估車資         中區             190
1     15x2元1張    隨機組     14天在其他尖峰預估車資         北區             802
2     15x2元1張    隨機組     14天在其他尖峰預估車資         南區             202
24      15元1張    隨機組     14天在其他尖峰預估車資         中區             199
25      15元1張    隨機組     14天在其他尖峰預估車資         北區             833
26      15元1張    隨機組     14天在其他尖峰預估車資         南區             191
48    20x2元1張    隨機組     14天在其他尖峰預估車資         中區             190
49    20x2元1張    隨機組     14天在其他尖峰預估車資         北區             781
50    20x2元1張    隨機組     14天在其他尖峰預估車資         南區             179
72      20元1張    隨機組     14天在其他尖峰預估車資         中區             217
73      20元1張    隨機組     14天在其他尖峰預估車資         北區             751
74      20元1張    隨機組     14天在其他尖峰預估車資         南區             162
96    30x2元1張    隨機組     14天在其他尖峰預估車資         中區             198
97    30x2元1張    隨機組     14天在其他尖峰預估車資         北區             780
98    30x2元1張    隨機組     

In [None]:
import numpy as np
import pandas as pd

# =========================
# Config
# =========================
# Metrics table to analyse (path is relative).
PATH = "../../result/random_metrics_by_city_group.csv"

# Accepted column-name aliases. Each list is scanned in order and the first
# name present in the loaded table is used.
CITY_COL_CANDIDATES = [
    "city_group",
    "city",
    "region",
    "area",
]
TARGET_COL_CANDIDATES = [
    "target",
    "y",
    "label",
]
MODEL_COL_CANDIDATES = [
    "model",
    "model_name",
    "algo",
]
RMSE_COL_CANDIDATES = [
    "rmse",
    "rmse_unweighted",
    "rmse_weighted",
    "rmse_weighted_by_user_cnt",
]

# The two Y targets to evaluate.
TARGETS = [
    "trip_cnt_per_user",
    "nonrepeat_cnt_per_user",
]

# Substrings that classify a (normalised) model name as baseline / LightGBM.
BASELINE_MODELS = [
    "lag1",
    "roll4",
    "baseline_lag1",
    "baseline_roll4",
]
LGBM_MODELS = [
    "lgbm",
    "lightgbm",
]

# =========================
# Helpers
# =========================
def pick_col(df, candidates, required=True):
    """Return the first name in ``candidates`` that is a column of ``df``.

    Args:
        df: Object exposing a ``columns`` attribute (e.g. a DataFrame).
        candidates: Column names to try, in priority order.
        required: When True (default), a missing column is an error;
            when False, ``None`` is returned instead.

    Returns:
        The first matching column name, or ``None`` if nothing matches and
        ``required`` is False.

    Raises:
        KeyError: If nothing matches and ``required`` is True.
    """
    present = [name for name in candidates if name in df.columns]
    if present:
        # Candidate order defines priority, so the earliest hit wins.
        return present[0]
    if not required:
        return None
    raise KeyError(f"Cannot find any of columns: {candidates}. Available: {list(df.columns)}")

def norm_str(x):
    """Convert ``x`` to text, trim surrounding whitespace and lower-case it.

    Non-string inputs go through ``str()`` first, so e.g. ``None`` becomes
    ``"none"`` and a float NaN becomes ``"nan"``.
    """
    text = str(x)
    trimmed = text.strip()
    return trimmed.lower()

# =========================
# Load
# =========================
df = pd.read_csv(PATH)

# Resolve the actual column names used by this file from the alias lists.
city_col, target_col, model_col = (
    pick_col(df, aliases)
    for aliases in (CITY_COL_CANDIDATES, TARGET_COL_CANDIDATES, MODEL_COL_CANDIDATES)
)

# RMSE column: "rmse" is listed first, so it wins when present; otherwise the
# first remaining candidate found in the file is used.
rmse_col = pick_col(df, RMSE_COL_CANDIDATES)

# Add helper columns for matching: target and model names are normalised
# (trimmed, lower-cased); the city column is copied unchanged.
tmp = df.assign(
    _city=df[city_col],
    _target=df[target_col].map(norm_str),
    _model=df[model_col].map(norm_str),
)

# Keep only the rows belonging to the targets of interest.
wanted_targets = [t.lower() for t in TARGETS]
tmp = tmp.loc[tmp["_target"].isin(wanted_targets)].copy()

# =========================
# Identify best baseline & LGBM per (city, target)
# =========================
def is_baseline(m):
    """Return True if the normalised name ``m`` contains any baseline keyword.

    Matching is by substring against the normalised entries of
    ``BASELINE_MODELS``, so a name only needs to contain a keyword.
    """
    name = norm_str(m)
    for keyword in BASELINE_MODELS:
        if norm_str(keyword) in name:
            return True
    return False

def is_lgbm(m):
    """Return True if the normalised name ``m`` contains any LightGBM keyword.

    Matching is by substring against the normalised entries of
    ``LGBM_MODELS``, so a name only needs to contain a keyword.
    """
    name = norm_str(m)
    for keyword in LGBM_MODELS:
        if norm_str(keyword) in name:
            return True
    return False

# Split the filtered rows into baseline models and LightGBM models, based on
# substring matches against the normalised model name.
base = tmp[tmp["_model"].map(is_baseline)].copy()
lgbm = tmp[tmp["_model"].map(is_lgbm)].copy()


def _best_per_group(rows, model_column, rmse_column, model_label, rmse_label):
    """Return the lowest-RMSE row for every (_city, _target) pair.

    The frame is sorted by RMSE (stable sort, NaN last) and the first row of
    each group is kept, so the model name and RMSE always come from the same
    original row. Rows whose group key is missing are dropped, matching the
    default behaviour of ``groupby``.

    Args:
        rows: Frame containing ``_city``, ``_target``, ``model_column`` and
            ``rmse_column``.
        model_column: Name of the column holding the model name.
        rmse_column: Name of the column holding the RMSE to minimise.
        model_label: Output name for the model column.
        rmse_label: Output name for the RMSE column.

    Returns:
        Frame with columns ``_city``, ``_target``, ``model_label`` and
        ``rmse_label``; one row per group.
    """
    keys = ["_city", "_target"]
    best_rows = (
        rows.dropna(subset=keys)
            .sort_values(rmse_column, ascending=True, kind="stable")
            .drop_duplicates(subset=keys, keep="first")
    )
    return (
        best_rows[keys + [model_column, rmse_column]]
        .rename(columns={model_column: model_label, rmse_column: rmse_label})
        .reset_index(drop=True)
    )


# Best (minimum-RMSE) baseline per (city, target).
base_best = _best_per_group(
    base, model_col, rmse_col, "best_baseline_model", "best_baseline_rmse"
)

# Best (minimum-RMSE) LightGBM per (city, target), in case several variants exist.
lgbm_best = _best_per_group(
    lgbm, model_col, rmse_col, "best_lgbm_model", "lgbm_rmse"
)

# Inner join: only (city, target) pairs that have both a baseline and an LGBM row survive.
out = base_best.merge(lgbm_best, on=["_city", "_target"], how="inner")

# Improvement ratio: (baseline - lgbm) / baseline; positive means LGBM has the lower RMSE.
out["improvement_ratio"] = (out["best_baseline_rmse"] - out["lgbm_rmse"]) / out["best_baseline_rmse"]
out["improvement_pct"] = out["improvement_ratio"] * 100

# Friendlier column names and a deterministic row order (target values stay lower-cased).
out = (
    out.rename(columns={"_city": "city_group", "_target": "target"})
       .sort_values(["target", "city_group"])
       .reset_index(drop=True)
)

print(out)

# To also save the result:
# out.to_csv("/mnt/data/lgbm_vs_best_baseline_improvement_by_city_group.csv", index=False, encoding="utf-8-sig")
