In [None]:
from google.colab import drive
# Mount Google Drive in the Colab runtime at /content/drive.
# NOTE(review): nothing in the visible cells reads from this mount point — confirm it is still needed.
drive.mount('/content/drive')

In [None]:
# ===========================
# From-scratch Logistic + Sigmoid LUT (No leakage)
# Features: AB, BB, SO, SB, CS -> BB_rate, SO_rate, SBatt_rate, SB_success, log_AB
# Target: Hit (Bernoulli) on synthetic AB-level logs
# ===========================
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)

# ---------- 1) Generate dummy season/player data (aggregate level) ----------
n_players = 300
seasons = np.arange(2019, 2025)  # 2019-2024 (the last season is held out for testing)
first_season = seasons.min()     # loop-invariant baseline for the era effect
rows = []
for y in seasons:
    aging = (y - first_season) * 0.03
    for pid in range(n_players):
        # Exposure
        AB = int(max(80, rng.normal(420, 90)))
        # Every binomial probability below is clipped to [0, 1]: each one is a
        # constant plus unbounded normal noise, and rng.binomial rejects
        # probabilities outside that range. The order of rng draws is kept
        # exactly as before so the seeded data set does not change.
        bb_prob = np.clip(0.09 + 0.01*rng.normal(), 0, 1)
        BB = rng.binomial(AB, bb_prob)
        so_prob = np.clip(0.22 - 0.03*rng.normal(), 0, 1)
        SO = rng.binomial(AB, so_prob)
        sb_att_prob = np.clip(0.12 + 0.08*rng.normal(), 0, 1)
        SB_att = rng.binomial(max(1, AB//7), sb_att_prob)
        sb_success_prob = np.clip(0.68 + 0.1*rng.normal(), 0, 1)
        SB = rng.binomial(SB_att, sb_success_prob)
        CS = SB_att - SB

        # Smoothed rate features (+0.5 / +1.0 keeps them away from 0 and 1).
        PA_star = AB + BB
        BB_rate    = (BB + 0.5) / (PA_star + 1.0)
        SO_rate    = (SO + 0.5) / (PA_star + 1.0)
        SBatt_rate = (SB + CS + 0.5) / (PA_star + 1.0)
        SB_success = (SB + 0.5) / (SB + CS + 1.0)
        log_AB     = np.log1p(AB)

        # HPA generating formula (the simulator) — this is what the model tries to fit
        z_true = (
            -2.1
            + 3.0*(0.5 - SO_rate)   # better contact -> higher
            + 1.6*(BB_rate)         # plate discipline -> higher
            + 0.6*(SBatt_rate)      # steal attempts -> higher
            + 0.8*(SB_success - 0.5)# steal success rate -> higher
            + 0.18*log_AB           # exposure (for sample stabilisation)
            + aging                 # era effect
            + 0.2*rng.normal()      # noise
        )
        p_true = 1/(1+np.exp(-z_true))
        H = rng.binomial(AB, p_true)

        rows.append([pid,y,AB,BB,SO,SB,CS,H,
                     BB_rate,SO_rate,SBatt_rate,SB_success,log_AB])

df = pd.DataFrame(rows, columns=[
    'player_id','season','AB','BB','SO','SB','CS','H',
    'BB_rate','SO_rate','SBatt_rate','SB_success','log_AB'
])

# Train/validation/test split by time: the latest season is the test set,
# the season before it is validation, everything earlier is training.
test_year = df['season'].max()
val_year  = test_year - 1
train_df  = df[df['season'] < val_year].copy()
val_df    = df[df['season'] == val_year].copy()
test_df   = df[df['season'] == test_year].copy()

feat_cols = ['BB_rate','SO_rate','SBatt_rate','SB_success','log_AB']

# ---------- 2) AB-레벨 베르누이 로그 생성 ----------
def expand_to_ab_logs(block: pd.DataFrame) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Expand aggregate rows into one Bernoulli observation per at-bat.

    Each row of ``block`` is replicated ``AB`` times. The feature vector
    (columns ``feat_cols``) is identical on every replica, and the target
    holds ``H`` ones and ``AB - H`` zeros in a random order drawn from the
    module-level ``rng``.

    Args:
        block: Frame with integer-valued ``AB`` and ``H`` columns plus every
            column named in ``feat_cols``.

    Returns:
        ``(X, y, w)`` where ``X`` has shape ``(sum(AB), len(feat_cols))``,
        ``y`` is an ``int8`` vector of 0/1 outcomes and ``w`` is a vector of
        unit sample weights, all with ``sum(AB)`` rows. An empty ``block``
        yields empty arrays of the matching shapes.
    """
    # np.vstack / np.concatenate refuse an empty list, so an empty split
    # (e.g. a season with no rows) is answered directly.
    if len(block) == 0:
        return (
            np.zeros((0, len(feat_cols)), dtype=float),
            np.zeros(0, dtype=np.int8),
            np.ones(0),
        )

    X_list, y_list, w_list = [], [], []
    for _, r in block.iterrows():
        n = int(r['AB'])
        # Features are season aggregates, so they are assumed constant over the season.
        Xrow = r[feat_cols].values.astype(float)
        Xrep = np.repeat(Xrow[None, :], n, axis=0)
        # Target: H successes and (AB - H) failures, shuffled into arbitrary order.
        y = np.zeros(n, dtype=np.int8)
        y[:int(r['H'])] = 1
        rng.shuffle(y)
        # Every at-bat carries the same sample weight of 1.
        X_list.append(Xrep)
        y_list.append(y)
        w_list.append(np.ones(n))
    return (np.vstack(X_list), np.concatenate(y_list), np.concatenate(w_list))

# Expand each split into at-bat level logs (train, then validation, then test,
# so the shared random generator is consumed in the same order as before).
Xtr_raw, ytr, wtr = expand_to_ab_logs(train_df)
Xva_raw, yva, wva = expand_to_ab_logs(val_df)
Xte_raw, yte, wte = expand_to_ab_logs(test_df)

# Standardise every split with statistics taken from the training split only;
# the small constant keeps the division finite for a constant feature.
mu = Xtr_raw.mean(axis=0)
sigma = Xtr_raw.std(axis=0) + 1e-9
Xtr, Xva, Xte = (
    (raw_features - mu) / sigma
    for raw_features in (Xtr_raw, Xva_raw, Xte_raw)
)

# ---------- 3) 시그모이드 LUT + 수치안정 ----------
class SigmoidLUT:
    """Sigmoid approximated by a precomputed lookup table on a uniform grid.

    The table holds ``1 / (1 + exp(-z))`` at ``steps`` evenly spaced points
    in ``[zmin, zmax]``. Inputs are clipped to that interval and mapped to
    the nearest grid node, so the approximation error is at most half a grid
    step in ``z`` and carries no systematic bias toward the lower node.

    Args:
        zmin: Lower edge of the tabulated interval.
        zmax: Upper edge of the tabulated interval; must exceed ``zmin``.
        steps: Number of grid nodes; must be at least 2.

    Raises:
        ValueError: If ``steps < 2`` or ``zmax <= zmin``.
    """

    def __init__(self, zmin=-12.0, zmax=12.0, steps=200_001):
        if steps < 2:
            raise ValueError(f"steps must be >= 2, got {steps}")
        if not zmax > zmin:
            raise ValueError(f"zmax must be greater than zmin, got [{zmin}, {zmax}]")
        self.zmin, self.zmax = zmin, zmax
        # Grid and table are stored in float32, so looked-up values are float32.
        self.grid = np.linspace(zmin, zmax, steps, dtype=np.float32)
        self.table = 1.0 / (1.0 + np.exp(-self.grid))
        # Grid nodes per unit of z; converts an offset from zmin into a table index.
        self.scale = (steps-1)/(zmax - zmin)

    def __call__(self, z):
        """Return the tabulated sigmoid of ``z`` (scalar or array-like)."""
        zc = np.clip(z, self.zmin, self.zmax)
        # Round to the nearest node; plain truncation would always pick the
        # node below and double the worst-case error.
        idx = np.rint((zc - self.zmin) * self.scale).astype(np.int64)
        return self.table[idx]

sigmoid = SigmoidLUT()  # drop-in replacement

# ---------- 4) 로지스틱 회귀(미니배치 GD + L2 + 얼리스타핑) ----------
def logistic_train(
    X, y, batch=4096, lr=0.2, epochs=50, l2=1e-3,
    X_val=None, y_val=None, early_patience=5, verbose=True
):
    """Fit a logistic regression with mini-batch gradient descent.

    Each epoch shuffles the rows with the module-level ``rng`` and takes one
    gradient step per mini-batch on the mean log-loss plus an L2 penalty on
    the weights (the bias is not penalised). Probabilities come from the
    module-level ``sigmoid`` callable.

    When a validation set is supplied, the validation Brier score is computed
    after every epoch, the best parameters are remembered, and training stops
    once ``early_patience`` consecutive epochs fail to improve it by more
    than 1e-6.

    Args:
        X: Training features, shape ``(n, d)``.
        y: Training targets (0/1), length ``n``.
        batch: Mini-batch size.
        lr: Learning rate.
        epochs: Maximum number of passes over the data.
        l2: L2 penalty coefficient applied to ``w``.
        X_val: Optional validation features.
        y_val: Validation targets; required when ``X_val`` is given.
        early_patience: Non-improving epochs tolerated before stopping.
        verbose: Print the validation score per epoch and the stop message.

    Returns:
        ``(w, b, val_brier, epoch)``. With a validation set these are the
        best-scoring parameters, their Brier score and the epoch that
        produced them. Without one, they are the parameters after the last
        epoch, ``nan`` for the score and the number of epochs run.

    Raises:
        ValueError: If ``X_val`` is given without ``y_val``.
    """
    has_val = X_val is not None
    if has_val and y_val is None:
        raise ValueError("y_val is required when X_val is provided")

    n, d = X.shape
    w = np.zeros(d, dtype=np.float64)
    b = 0.0

    best = {'val_brier': np.inf, 'w': w.copy(), 'b': b, 'epoch': -1}
    patience = early_patience
    last_epoch = 0

    def brier(y_true, p):
        return np.mean((y_true - p)**2)

    for ep in range(1, epochs+1):
        last_epoch = ep
        # Reshuffle the rows every epoch.
        idx = rng.permutation(n)
        Xs, ys = X[idx], y[idx]
        # Sweep over the mini-batches.
        for i in range(0, n, batch):
            Xe = Xs[i:i+batch]
            ye = ys[i:i+batch]
            # Predicted probabilities for this batch.
            z = Xe @ w + b
            p = sigmoid(z)
            # Gradient of the mean negative log-likelihood plus the L2 term:
            # dL/dw = X^T (p - y)/m + l2*w
            m = len(ye)
            grad_w = Xe.T @ (p - ye) / m + l2*w
            grad_b = np.sum(p - ye) / m
            # Gradient step.
            w -= lr * grad_w
            b -= lr * grad_b

        # Validation and early stopping.
        if has_val:
            pv = sigmoid(X_val @ w + b)
            vb = brier(y_val, pv)
            if verbose:
                print(f"[epoch {ep:02d}] val Brier={vb:.5f}")
            if vb + 1e-6 < best['val_brier']:
                best = {'val_brier': vb, 'w': w.copy(), 'b': b, 'epoch': ep}
                patience = early_patience
            else:
                patience -= 1
                # "<= 0" so a non-positive early_patience still stops
                # instead of counting down past zero forever.
                if patience <= 0:
                    if verbose:
                        print(f"Early stop @ epoch {ep} (best={best['val_brier']:.5f})")
                    break

    if not has_val:
        # No validation data means `best` was never updated; returning it
        # would hand back the all-zero initial parameters. Return the
        # trained ones instead.
        return w, b, float('nan'), last_epoch
    return best['w'], best['b'], best['val_brier'], best['epoch']

# Fit on the training logs; the validation logs are passed in for the
# per-epoch Brier score and early stopping.
w, b, best_brier, best_epoch = logistic_train(
    Xtr,
    ytr,
    batch=8192,
    lr=0.25,
    epochs=80,
    l2=5e-4,
    X_val=Xva,
    y_val=yva,
    early_patience=6,
)
print(f"\n베스트 에폭 {best_epoch}, 검증 Brier={best_brier:.6f}")

# ---------- 5) 테스트 평가 & 신타율(HPA) 산출 ----------
def brier(y, p):
    """Return the Brier score: the mean squared gap between outcomes and probabilities."""
    residual = y - p
    return np.mean(residual**2)

# Score every test at-bat and report the overall Brier score.
test_logits = Xte @ w + b
p_te = sigmoid(test_logits)
test_brier = brier(yte, p_te)
print(f"테스트 Brier: {test_brier:.6f}")

# Per-player HPA = mean predicted probability over that player's at-bats in the season.
ab_per_row = test_df['AB'].astype(int).values
test_idx = test_df.index.repeat(ab_per_row)
test_players = test_df[['player_id', 'AB', 'H']].to_numpy()
# owner[i] is the positional row of test_df that at-bat log i was expanded from.
owner = np.repeat(np.arange(len(test_df), dtype=np.int32), ab_per_row)

# Collect the predicted probabilities row by row, in first-seen order.
by_player = {}
for row_pos, prob in zip(owner, p_te):
    if row_pos not in by_player:
        by_player[row_pos] = []
    by_player[row_pos].append(prob)

# One leaderboard entry per test row: observed average, model HPA and their gap.
leaderboard = []
for row_pos, probs in by_player.items():
    pid, ABk, Hk = (int(value) for value in test_players[row_pos])
    AVG = Hk / ABk
    HPA = float(np.mean(probs))
    leaderboard.append([pid, ABk, Hk, AVG, HPA, HPA - AVG])

lead = pd.DataFrame(
    leaderboard,
    columns=['player_id', 'AB', 'H', 'AVG', 'HPA', 'Delta'],
)
lead = lead.sort_values('Delta', ascending=False)

# Largest positive gaps first (model expects more than the observed average).
print("\n=== 테스트 상위 반등 후보 (HPA > AVG) ===")
top_rows = lead.head(10)
print(top_rows.to_string(index=False, float_format="{:.3f}".format))

# The ten smallest gaps, most negative first.
print("\n=== 테스트 과대평가 후보 (HPA < AVG) =====")
bottom_rows = lead.tail(10).sort_values('Delta')
print(bottom_rows.to_string(index=False, float_format="{:.3f}".format))

# Weights for interpretation; they apply to the standardised inputs.
coef_table = pd.DataFrame({'feature': feat_cols, 'coef(stdized)': w})
coef_table = coef_table.sort_values('coef(stdized)', ascending=False)
print("\n=== 계수(표준화 기준) ===")
print(coef_table.to_string(index=False, float_format="{:.4f}".format))

[epoch 01] val Brier=0.24454
[epoch 02] val Brier=0.24452
[epoch 03] val Brier=0.24457
[epoch 04] val Brier=0.24453
[epoch 05] val Brier=0.24448
[epoch 06] val Brier=0.24447
[epoch 07] val Brier=0.24452
[epoch 08] val Brier=0.24457
[epoch 09] val Brier=0.24456
[epoch 10] val Brier=0.24457
[epoch 11] val Brier=0.24443
[epoch 12] val Brier=0.24451
[epoch 13] val Brier=0.24457
[epoch 14] val Brier=0.24453
[epoch 15] val Brier=0.24448
[epoch 16] val Brier=0.24462
[epoch 17] val Brier=0.24448
Early stop @ epoch 17 (best=0.24443)

베스트 에폭 11, 검증 Brier=0.244427
테스트 Brier: 0.242652

=== 테스트 상위 반등 후보 (HPA > AVG) ===
 player_id  AB   H   AVG   HPA  Delta
       270 375 154 0.411 0.525  0.114
         9 465 236 0.508 0.612  0.105
       155 337 133 0.395 0.497  0.102
        97 435 208 0.478 0.568  0.090
        67 442 198 0.448 0.537  0.089
       210 381 173 0.454 0.532  0.077
       215 418 216 0.517 0.590  0.073
       251 349 133 0.381 0.453  0.072
       164 466 265 0.569 0.639  0.070
      