# Evaluating Decision Rules Across Many Weak Experiments
[リンク](https://arxiv.org/pdf/2502.08763v2)

In [1]:
import numpy as np
import pandas as pd
import random

In [7]:
random.seed(0)
np.random.seed(0)

# Simulation parameters: AとBに本当は差がないものとしてシミュレーションする
n_exp = 500            # number of experiments
users_per_arm = 100    # users per arm (A:100, B:100)
p_A = 0.10             # true conversion rate A
p_B = 0.10             # true conversion rate B  (no true uplift to emphasise bias)

records = []
for i in range(1, n_exp + 1):
    # outcomes for arm A
    outcomes_A = np.random.binomial(1, p_A, users_per_arm)
    # outcomes for arm B
    outcomes_B = np.random.binomial(1, p_B, users_per_arm)
    for o in outcomes_A:
        records.append({"exp": i, "arm": "A", "outcome": o})
    for o in outcomes_B:
        records.append({"exp": i, "arm": "B", "outcome": o})

df_big = pd.DataFrame(records)
print('armはABテストのAorBを指す')
print('expはABテストのテスト番号をを指す')
print('outcomeはABテストの対象KPI（CVRなど）をを指す')
display(df_big.groupby(['exp', 'arm'])[['outcome']].mean())

armはABテストのAorBを指す
expはABテストのテスト番号をを指す
outcomeはABテストの対象KPI（CVRなど）をを指す


Unnamed: 0_level_0,Unnamed: 1_level_0,outcome
exp,arm,Unnamed: 2_level_1
1,A,0.09
1,B,0.09
2,A,0.14
2,B,0.09
3,A,0.16
...,...,...
498,B,0.10
499,A,0.09
499,B,0.06
500,A,0.15


In [15]:
# Naïve estimator over all experiments
naive_success = 0
naive_units = 0

for i, g in df_big.groupby("exp"): # i: expの番号、g: expごとのグループ
    mean_A = g.loc[g["arm"] == "A", "outcome"].mean()
    mean_B = g.loc[g["arm"] == "B", "outcome"].mean()
    winner = "A" if mean_A >= mean_B else "B"
    win_rows = g[g["arm"] == winner]
    naive_success += win_rows["outcome"].sum()
    naive_units += len(win_rows)

naive_rate = naive_success / naive_units
print('ABテストで勝った方のoutcome平均値')
naive_rate

ABテストで勝った方のoutcome平均値


np.float64(0.11722)

In [16]:
# 2-fold CV estimator
cv_success = 0
cv_units = 0

for i, g in df_big.groupby("exp"):
    # 各実験のデータ(200ユーザー、ABそれぞれ100)をシャッフルして、上100行、下100行に分ける
    shuffled = g.sample(frac=1, random_state=0).reset_index(drop=True)
    foldA = shuffled.iloc[:users_per_arm]   # first 100 rows
    foldB = shuffled.iloc[users_per_arm:]   # second 100 rows

    for train_fold, eval_fold in [(foldA, foldB), (foldB, foldA)]:
        mean_A = train_fold.loc[train_fold["arm"] == "A", "outcome"].mean()
        mean_B = train_fold.loc[train_fold["arm"] == "B", "outcome"].mean()
        winner = "A" if mean_A >= mean_B else "B"
        eval_rows = eval_fold[eval_fold["arm"] == winner]
        cv_success += eval_rows["outcome"].sum()
        cv_units += len(eval_rows)

cv_rate = cv_success / cv_units

# Aggregate true rate (for reference)
true_rate = (p_A + p_B) / 2  # 0.10

summary = pd.DataFrame({
    "Estimator": ["True (unknown)", "Naïve plug‑in", "2‑fold CV"],
    "Estimated Success Rate": [true_rate, naive_rate, cv_rate]
})

summary


Unnamed: 0,Estimator,Estimated Success Rate
0,True (unknown),0.1
1,Naïve plug‑in,0.11722
2,2‑fold CV,0.10199
