In [2]:
import pandas as pd, numpy as np, os, math

# Load the predictions CSV; `df` is the frame passed to build_improve_table
# further down in this cell.
df = pd.read_csv(
    filepath_or_buffer="../../result/control/control_predictions_full.csv",
)

def rmse(y, p):
    """Return the root-mean-square error between ``y`` and ``p``.

    Both arguments are converted to float NumPy arrays before the
    element-wise difference is taken, so any array-like of numbers is
    accepted. The result is returned as a built-in ``float``.
    """
    actual, predicted = (np.asarray(values, dtype=float) for values in (y, p))
    residuals = actual - predicted
    mean_squared_error = np.mean(residuals ** 2)
    return float(np.sqrt(mean_squared_error))

def build_improve_table(df, segment_col):
    """Compare LightGBM RMSE against the best baseline RMSE per segment.

    For every value of ``segment_col`` (missing values form their own group)
    and for each of the two targets ``trip_cnt_per_user`` and
    ``nonrepeat_cnt_per_user``, rows with a missing target or a missing
    prediction are dropped, the RMSE of each baseline (``baseline_lag1``,
    ``baseline_roll4``) and of ``lgbm_global`` is computed, and the relative
    improvement of LightGBM over the lower-RMSE baseline is recorded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``segment_col``, both target columns and the six
        prediction columns ``pred_y{1,2}_{lag1,roll4,lgbm}``.
    segment_col : str
        Name of the column to group by.

    Returns
    -------
    pandas.DataFrame
        One row per (segment, target) pair that has at least one complete
        observation, sorted by ``segment_col`` then ``target``, with columns
        ``segment_col``, ``target``, ``best_baseline_model``,
        ``best_baseline_rmse``, ``lgbm_model``, ``lgbm_rmse``,
        ``improve_ratio`` and ``improve_pct``. When no pair qualifies, an
        empty frame with these same columns is returned.
    """
    columns = [
        segment_col,
        "target",
        "best_baseline_model",
        "best_baseline_rmse",
        "lgbm_model",
        "lgbm_rmse",
        "improve_ratio",
        "improve_pct",
    ]
    baseline_names = ("baseline_lag1", "baseline_roll4")
    # Target column -> {model name: prediction column}.
    targets = [
        (
            "trip_cnt_per_user",
            {
                "baseline_lag1": "pred_y1_lag1",
                "baseline_roll4": "pred_y1_roll4",
                "lgbm_global": "pred_y1_lgbm",
            },
        ),
        (
            "nonrepeat_cnt_per_user",
            {
                "baseline_lag1": "pred_y2_lag1",
                "baseline_roll4": "pred_y2_roll4",
                "lgbm_global": "pred_y2_lgbm",
            },
        ),
    ]

    rows = []
    for seg_val, g in df.groupby(segment_col, dropna=False):
        for y_col, preds in targets:
            # Score every model on the same rows: those where the target and
            # all three predictions are present.
            g2 = g.dropna(subset=[y_col, *preds.values()])
            if g2.empty:
                continue
            y = g2[y_col].astype(float).values

            baseline_rmses = {
                bname: rmse(y, g2[preds[bname]].astype(float).values)
                for bname in baseline_names
            }
            # min() returns the first minimal key, so baseline_lag1 wins ties.
            best_baseline_model = min(baseline_rmses, key=baseline_rmses.get)
            best_baseline_rmse = baseline_rmses[best_baseline_model]

            lgbm_rmse = rmse(y, g2[preds["lgbm_global"]].astype(float).values)
            # Positive ratio means LightGBM has the lower RMSE; undefined
            # (NaN) when the baseline is already perfect.
            if best_baseline_rmse != 0:
                improve_ratio = (best_baseline_rmse - lgbm_rmse) / best_baseline_rmse
            else:
                improve_ratio = np.nan
            rows.append({
                segment_col: seg_val,
                "target": y_col,
                "best_baseline_model": best_baseline_model,
                "best_baseline_rmse": best_baseline_rmse,
                "lgbm_model": "lgbm_global",
                "lgbm_rmse": lgbm_rmse,
                "improve_ratio": improve_ratio,
                "improve_pct": improve_ratio * 100,
            })

    # Passing columns= fixes the column order and, unlike selecting columns
    # after construction, also works when `rows` is empty.
    out = pd.DataFrame(rows, columns=columns)
    return out.sort_values([segment_col, "target"]).reset_index(drop=True)

# Destination files for the two per-segment summaries.
city_path = "../../result/control/control_improve_by_city_group.csv"
ops_path = "../../result/control/control_improve_by_ops_type_merged.csv"

# One comparison table per segmentation column.
city_tbl = build_improve_table(df, "city_group")
ops_tbl = build_improve_table(df, "ops_type_merged")

# "utf-8-sig" prepends a BOM to each file -- presumably so spreadsheet tools
# detect UTF-8 for non-ASCII segment labels; confirm with the consumers.
city_tbl.to_csv(path_or_buf=city_path, index=False, encoding="utf-8-sig")
ops_tbl.to_csv(path_or_buf=ops_path, index=False, encoding="utf-8-sig")

# Final expression of the cell: shown as the cell's output in a notebook.
(
    city_tbl,
    ops_tbl.head(10),
    city_path,
    ops_path,
)


(  city_group                  target best_baseline_model  best_baseline_rmse  \
 0         中區  nonrepeat_cnt_per_user      baseline_roll4            0.043242   
 1         中區       trip_cnt_per_user      baseline_roll4            0.025663   
 2         北區  nonrepeat_cnt_per_user      baseline_roll4            0.067262   
 3         北區       trip_cnt_per_user      baseline_roll4            0.030978   
 4         南區  nonrepeat_cnt_per_user      baseline_roll4            0.042916   
 5         南區       trip_cnt_per_user      baseline_roll4            0.023010   
 
     lgbm_model  lgbm_rmse  improve_ratio  improve_pct  
 0  lgbm_global   0.035365       0.182158    18.215837  
 1  lgbm_global   0.025249       0.016127     1.612693  
 2  lgbm_global   0.035568       0.471204    47.120433  
 3  lgbm_global   0.024554       0.207375    20.737497  
 4  lgbm_global   0.035350       0.176280    17.627976  
 5  lgbm_global   0.026032      -0.131356   -13.135634  ,
   ops_type_merged             