In [2]:
import pandas as pd
import numpy as np
from collections import defaultdict

# Input/output locations. DATA_PATH is loaded into `df` (match-level rows),
# FEATURES_PATH into `features` (per-player rows), and the per-map predictions
# are written to OUTPUT_PATH.
DATA_PATH     = "data/vlr_patchpool_demo.csv"
FEATURES_PATH = "data/vlr_features_demo.csv"
OUTPUT_PATH   = "data/vlr_map_preds_demo.csv"
# A match is only scored when BOTH teams have at least this many history rows
# before the pre-match cutoff (and since their last full roster reset).
MIN_MATCHES   = 5

# Hyperparams (demo values; not tuned. Placeholder values also in place for picks/bans)
PATCH_LAM  = 0.5 #patch decay: row weight is 1 / (1 + PATCH_LAM * log1p(patches between row and match))
REC_LAM    = 0.5 #time based recency decay: row weight is exp(-REC_LAM * age of the row in days)
CORE_LAM   = 0.5 #roster continuity decay: rate inside the 1 - exp(-CORE_LAM * days) continuity terms
MOV_LAM    = 0.5 #margin of victory weight: multiplies each map's round differential as it is accumulated
MARGIN_W   = 0.5 #relative weight for MOV signal vs pick and ban signal (applied to the accumulated MOV total)
# Constant added to every map's raw score, and the score used when a team has no rows.
# NOTE(review): because it is identical for every map it should cancel in the softmax,
# so it does not look like it acts as a probability floor - confirm intent.
EPSILON    = 0.5 #baseline probability floor

#pick/ban weights; positive values increase map probability, negative values decrease it
#keys are column suffixes; they are looked up as "team1_<key>" / "team2_<key>" on each history row
PB_WEIGHTS = {
    "pick_1":  0.5, "pick_2": 0.5, "pick_3": 0.5,
    "ban_1":  -0.5, "ban_2": -0.5, "ban_3": -0.5
}

def canonicalize_team(s: str) -> str:
    """Return the trimmed team name, mapping the "NRG Esports" alias to "NRG".

    The alias check is case-insensitive; every other name is returned as-is
    after whitespace trimming. Non-string input is converted with ``str()``.
    """
    name = str(s).strip()
    if name.lower() == "nrg esports":
        return "NRG"
    return name

def canonical_match_time(ts: pd.Timestamp) -> pd.Timestamp:
    """Snap a timestamp to the nearest half-hour slot (:00 or :30).

    Fewer than 15 minutes past the hour snaps down to :00, from 15 up to (but
    not including) 45 snaps to :30, and 45 or later rolls over to the next
    full hour.
    """
    stamp = pd.Timestamp(ts)
    hour_start = stamp.floor("h")
    # fractional minutes past the hour (sub-microsecond precision is ignored)
    minutes_past = stamp.minute + stamp.second / 60 + stamp.microsecond / 6e7
    if minutes_past < 15:
        return hour_start
    if minutes_past < 45:
        return hour_start.replace(minute=30)
    return hour_start + pd.Timedelta(hours=1)

def parse_active_pool_string(s):
    """Parse a bracketed map-pool string into a list of lowercase map names.

    Accepts the repr-style form ``"['Ascent', 'Bind']"`` as well as the
    double-quoted form ``'["Ascent", "Bind"]'``; both quote characters are
    removed so the names compare equal to lowercased map names elsewhere.
    A list or tuple of names is normalised the same way (non-string items are
    skipped) instead of failing inside ``pd.isna``.

    Returns an empty list for missing values (None/NaN) and for empty pools.
    Order and duplicates are preserved.
    """
    if isinstance(s, (list, tuple)):
        raw_items = [item for item in s if isinstance(item, str)]
    elif pd.isna(s):
        return []
    else:
        raw_items = str(s).strip().strip("[]").split(",")
    # drop both quote styles, then trim and lowercase each name
    names = (
        item.replace("'", "").replace('"', "").strip().lower()
        for item in raw_items
    )
    return [name for name in names if name]

def pick_expected_maps(best_of_value):
    """Return the number of maps used as the prediction target for a series.

    A best-of-5 yields 3. A best-of-3, any other series length, and any value
    that ``int()`` cannot convert all yield 2.

    NOTE(review): a best-of-1 therefore also yields 2 - confirm that is intended.
    """
    try:
        series_length = int(best_of_value)
    except Exception:
        return 2
    return 3 if series_length == 5 else 2
        
def softmax_to_target(x, target_sum):
    """Softmax over ``x`` rescaled so the outputs sum to ``target_sum``.

    The maximum (ignoring NaN) is subtracted before exponentiating for
    numerical stability. An empty input is returned unchanged. If the
    exponentials do not add up to a finite positive number (for example
    because an entry is NaN), ``target_sum`` is split evenly across entries.
    """
    scores = np.asarray(x, dtype=float)
    if scores.size == 0:
        return scores
    shifted = scores - np.nanmax(scores)
    weights = np.exp(shifted)
    total = weights.sum()
    if np.isfinite(total) and total > 0:
        return weights / total * target_sum
    # degenerate case: fall back to a uniform split
    return np.full_like(shifted, fill_value=(target_sum / max(len(shifted), 1)))

def cap_and_waterfill(probs, target_sum, cap=1.0, tol=1e-9, max_iter=50):
    """Rescale ``probs`` towards ``target_sum`` while keeping every entry <= ``cap``.

    Entries are first clipped into ``[0, cap]``. If the clipped total already
    matches ``target_sum`` (within ``tol``) it is returned as-is; if it is too
    large, every entry is scaled down proportionally. If it is too small, the
    missing mass is handed out to the uncapped entries in proportion to their
    remaining headroom, for at most ``max_iter`` rounds.

    Returns all zeros when ``target_sum`` is not positive. When the target
    cannot be reached without exceeding ``cap``, every entry ends at ``cap``
    and the result sums to less than ``target_sum``.
    """
    filled = np.minimum(np.clip(np.asarray(probs, dtype=float), 0.0, np.inf), cap)
    if target_sum <= 0:
        return np.zeros_like(filled)

    mass = filled.sum()
    if abs(mass - target_sum) <= tol:
        return filled
    if mass > target_sum + tol:
        return filled * (target_sum / mass)

    deficit = target_sum - mass
    has_room = filled < cap
    for _ in range(max_iter):
        if not (deficit > tol and has_room.any()):
            break
        headroom = np.where(has_room, cap - filled, 0.0)
        headroom_total = headroom.sum()
        if headroom_total <= tol:
            break
        # share the deficit by headroom, then re-apply the cap
        filled = np.minimum(filled + deficit * (headroom / headroom_total), cap)
        deficit = target_sum - filled.sum()
        has_room = filled < cap - 1e-12
    return filled

#load the match-level table and the per-player features table
df = pd.read_csv(DATA_PATH)
features = pd.read_csv(FEATURES_PATH, parse_dates=["series_datetime"])


#trim team names, parse match timestamps (unparseable values become NaT), add canonical team names
#NOTE(review): astype(str) presumably turns a missing team name into the literal "nan" - confirm none exist
df["team1_name"] = df["team1_name"].astype(str).str.strip()
df["team2_name"] = df["team2_name"].astype(str).str.strip()
df["series_datetime"] = pd.to_datetime(df["series_datetime"], errors="coerce")
df["team1_name_canon"] = df["team1_name"].apply(canonicalize_team)
df["team2_name_canon"] = df["team2_name"].apply(canonicalize_team)

#lowercase parsing of active pool (one list of map names per row)
df["match_active_pool"] = df["ActiveMapPool"].apply(parse_active_pool_string)

#fill value for roster continuity; if not present use large sample value
#(999 stands in for "unknown / long ago" for both missing cells and missing columns)
for k in (3, 4, 5):
    col = f"days_since_{k}_of_5"
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(999).astype(int)
    else:
        df[col] = 999

#remove fake teams: any row where either team name contains a parenthesised part
mask = (df["team1_name"].str.contains(r"\(.*\)", regex=True, na=False)
        | df["team2_name"].str.contains(r"\(.*\)", regex=True, na=False))
df = df[~mask].reset_index(drop=True)

#assign MatchID if missing: fall back to the (freshly reset) row index; IDs are always strings
df["MatchID"] = (df["MatchID"].astype(str) if "MatchID" in df.columns else df.index.astype(str))

#unique series_datetime, patch_index pairs sorted chronologically
patch_timeline = (features[["series_datetime", "patch_index"]]
                  .dropna().drop_duplicates().sort_values("series_datetime"))

def patch_index_asof(ts):
    """Return the patch_index of the latest timeline entry strictly before ``ts``.

    Returns 0.0 when ``ts`` is missing or the patch timeline is empty, and the
    earliest known patch_index when ``ts`` is not later than any entry.
    """
    if pd.isna(ts) or len(patch_timeline) == 0:
        return 0.0
    entry_times = patch_timeline["series_datetime"]
    # position of the last entry before ts, clamped to the first row
    pos = max(entry_times.searchsorted(pd.Timestamp(ts), side="left") - 1, 0)
    return float(patch_timeline.iloc[pos]["patch_index"])

#chronological timeline of distinct (timestamp, raw pool string) pairs taken from the match table
pp_pools = (
    df[["series_datetime", "ActiveMapPool"]]
    .dropna()
    .drop_duplicates()
    .sort_values("series_datetime")
)


def active_pool_asof(ts):
    """Return the de-duplicated active map pool in effect just before ``ts``.

    Uses the latest timeline entry strictly before ``ts`` (or the first entry
    when none is earlier). If that pool has fewer than 7 maps, the most recent
    earlier entry with at least 7 maps is returned instead; if there is none,
    the short pool is returned. An empty timeline yields an empty list.
    """
    if len(pp_pools) == 0:
        return []
    raw_pools = pp_pools["ActiveMapPool"]
    pos = max(pp_pools["series_datetime"].searchsorted(pd.Timestamp(ts), side="left") - 1, 0)
    current = parse_active_pool_string(raw_pools.iloc[pos])
    if len(current) < 7:
        # walk backwards looking for the latest complete pool
        for earlier in range(pos - 1, -1, -1):
            candidate = parse_active_pool_string(raw_pools.iloc[earlier])
            if len(candidate) >= 7:
                return list(dict.fromkeys(candidate))
    return list(dict.fromkeys(current))

#per-team match history: canonical team name -> all of its matches (either side), oldest first
team_histories = {}
all_teams_canon = pd.unique(pd.concat([df["team1_name_canon"], df["team2_name_canon"]]))
for team in all_teams_canon:
    involved = df["team1_name_canon"].eq(team) | df["team2_name_canon"].eq(team)
    hist = df.loc[involved].sort_values("series_datetime").reset_index(drop=True)
    team_histories[team] = hist

#extract and clean core feature fields: keep only rows where all four fields are present,
#force game_id to numeric (dropping rows where that fails), trim team and player labels
feat = features[['game_id', 'series_datetime', 'player_team_full', 'player_handle']].dropna()
feat['game_id'] = pd.to_numeric(feat['game_id'], errors='coerce')
feat = feat.dropna(subset=['game_id'])
feat['player_team_full'] = feat['player_team_full'].astype(str).str.strip()
feat['player_handle'] = feat['player_handle'].astype(str).str.strip()

#per (team, game_id): the roster as a frozenset of player handles
feat_rosters = (
    feat.groupby(['player_team_full', 'game_id'])['player_handle']
        .apply(lambda s: frozenset(set(s)))
        .reset_index()
)

# Attach per-game series_datetime for ordering: distinct (game_id, series_datetime) pairs
# NOTE(review): game_id is not coerced to numeric here as it is for `feat` above - the merge
# below assumes both sides share a dtype; confirm against the features file
gid_time = (features[['game_id', 'series_datetime']]
            .dropna()
            .drop_duplicates()
            .assign(series_datetime=lambda d: pd.to_datetime(d['series_datetime'], errors='coerce')))

#merge match timestamps onto rosters, drop rosters without a timestamp, sort rosters chronologically at team level
feat_rosters = feat_rosters.merge(gid_time, on='game_id', how='left')
feat_rosters = feat_rosters.dropna(subset=['series_datetime'])
feat_rosters = feat_rosters.sort_values(['player_team_full', 'series_datetime'])

#detect full roster resets (no shared players with prior lineup), record reset timestamps per team
def _roster_reset_times(games):
    """Return the timestamps of games whose roster shares no player with the previous game.

    ``games`` is a slice of ``feat_rosters``: it needs a ``series_datetime``
    column and a ``player_handle`` column holding one frozenset per game.
    The first game never counts as a reset.
    """
    ordered = games.sort_values('series_datetime')
    resets = []
    prev_roster = None
    for roster, played_at in zip(ordered['player_handle'], ordered['series_datetime']):
        if prev_roster is not None and prev_roster.isdisjoint(roster):
            resets.append(pd.Timestamp(played_at))
        prev_roster = roster
    return resets


#keys include both the original label and the canonicalized label for robustness
team_reset_times = {}
for team_label, g in feat_rosters.groupby('player_team_full'):
    team_reset_times[team_label] = _roster_reset_times(g)

#canonical keys are computed over the COMBINED timeline of every alias of a team, so a reset
#that coincides with a label change is detected and no alias's resets are dropped; for a team
#without aliases this reproduces the per-label result above
canon_labels = feat_rosters['player_team_full'].map(canonicalize_team)
for canon, g in feat_rosters.groupby(canon_labels):
    team_reset_times[canon] = _roster_reset_times(g)
        
def last_reset_before(team_label: str, ts: pd.Timestamp):
    """Return the team's latest full-roster reset strictly before ``ts``.

    Returns None when the team is unknown, has no recorded resets, or none of
    its resets precede ``ts``.
    """
    known_resets = team_reset_times.get(team_label)
    if not known_resets:
        return None
    cutoff = pd.Timestamp(ts)
    earlier = [reset for reset in known_resets if pd.Timestamp(reset) < cutoff]
    return max(earlier, default=None)

def compute_team_map_scores(hist, team_canon, patch_asof, sched_ts, pool_lc):
    """Score every map in ``pool_lc`` for one team from its weighted match history.

    Each history row involving ``team_canon`` gets a weight that is the product
    of a patch-distance weight, a recency weight and a roster-continuity
    confidence. With that weight the row contributes
      * its picks/bans (via ``PB_WEIGHTS``) for the side the team played on, and
      * the round differential of each played map (only when all
        ``Map1..Map5`` name/RD columns exist in ``hist``).
    Only maps present in ``pool_lc`` are accumulated.

    Returns a dict ``map -> pick/ban total + MARGIN_W * MOV total + EPSILON``.
    If the team has no rows in ``hist`` every map scores ``EPSILON``.

    NOTE(review): ``Map{i}_RD`` is used with the same sign whichever side the
    team was on - confirm the column is not expressed from team1's perspective.
    """
    def _days_since(match, key):
        # missing or non-numeric continuity values fall back to 999 days
        value = pd.to_numeric(match.get(key, 999), errors="coerce")
        return 999.0 if pd.isna(value) else float(value)

    on_team1 = hist["team1_name_canon"] == team_canon
    on_team2 = hist["team2_name_canon"] == team_canon
    team_rows = hist[on_team1 | on_team2]
    if team_rows.empty:
        return {m: EPSILON for m in pool_lc}

    map_slots = range(1, 6)
    #MOV contribution needs every Map{i}_name / Map{i}_RD column to exist
    has_map_cols = all(
        f"Map{i}_name" in hist.columns and f"Map{i}_RD" in hist.columns
        for i in map_slots
    )
    pick_ban_signal = defaultdict(float)
    margin_signal = defaultdict(float)

    for _, match in team_rows.iterrows():
        d3 = _days_since(match, "days_since_3_of_5")
        if d3 == 0:
            #rows with a 3-of-5 continuity value of exactly 0 are skipped
            continue

        #patch weight: the more patches between this row and the match, the lower the weight
        match_patch = float(pd.to_numeric(match.get("patch_index", np.nan), errors="coerce"))
        if not np.isfinite(match_patch):
            match_patch = patch_index_asof(match["series_datetime"])
        patches_behind = max(0.0, float(patch_asof) - match_patch)
        w_patch = 1.0 / (1.0 + PATCH_LAM * np.log1p(patches_behind))

        #recency weight: exponential decay in days; rows without a timestamp count as 999 days old
        played_at = match["series_datetime"]
        if pd.notna(played_at):
            age_days = (pd.Timestamp(sched_ts) - played_at).total_seconds() / 86400.0
        else:
            age_days = 999.0
        w_time = float(np.exp(-REC_LAM * max(0.0, age_days)))

        #roster continuity confidence from the 3/4/5-of-5 day counts
        d4 = _days_since(match, "days_since_4_of_5")
        d5 = _days_since(match, "days_since_5_of_5")
        conf = 0.2*(1-np.exp(-CORE_LAM*d3)) + 0.3*(1-np.exp(-CORE_LAM*d4)) + 0.5*(1-np.exp(-CORE_LAM*d5))

        w = w_patch * w_time * conf
        if w <= 0:
            continue

        #which side the team was on decides the pick/ban column prefix
        if match.get("team1_name_canon") == team_canon:
            side = "team1_"
        elif match.get("team2_name_canon") == team_canon:
            side = "team2_"
        else:
            side = None

        #accumulate pick/ban signal for maps in the current pool
        if side is not None:
            for action, action_weight in PB_WEIGHTS.items():
                chosen = match.get(side + action)
                if not isinstance(chosen, str):
                    continue
                map_key = chosen.strip().lower()
                if map_key in pool_lc:
                    pick_ban_signal[map_key] += action_weight * w

        #accumulate MOV signal at map level
        if has_map_cols:
            for i in map_slots:
                map_name = match.get(f"Map{i}_name")
                round_diff = match.get(f"Map{i}_RD")
                if isinstance(map_name, str) and pd.notna(round_diff):
                    map_key = map_name.strip().lower()
                    if map_key in pool_lc:
                        margin_signal[map_key] += float(round_diff) * w * MOV_LAM

    #combine pick/ban and MOV signals, plus the constant EPSILON offset
    return {
        m: pick_ban_signal.get(m, 0.0) + margin_signal.get(m, 0.0) * MARGIN_W + EPSILON
        for m in pool_lc
    }

#build per-match context: get canonical start time, 2h prematch cutoff, canonical team names
#one context tuple is appended per match that passes every filter below
contexts = []
df_matches = df.drop_duplicates(subset="MatchID")
for _, row in df_matches.iterrows():
    raw_ts = pd.Timestamp(row["series_datetime"])
    sched  = canonical_match_time(raw_ts)
    #only history strictly before this cutoff is used for the match
    cutoff = sched - pd.Timedelta(hours=2)

    t1_c = row["team1_name_canon"]; t2_c = row["team2_name_canon"]

    #reset-aware history window: restrict each team's past matches to its current roster era before cutoff
    #(look up resets by canonical name first, then fall back to the raw team name)
    lr1 = last_reset_before(t1_c, cutoff)
    if lr1 is None:
        lr1 = last_reset_before(str(row.get("team1_name", "")), cutoff)
    lr2 = last_reset_before(t2_c, cutoff)
    if lr2 is None:
        lr2 = last_reset_before(str(row.get("team2_name", "")), cutoff)

    base_h1 = team_histories[t1_c][team_histories[t1_c]["series_datetime"] < cutoff]
    base_h2 = team_histories[t2_c][team_histories[t2_c]["series_datetime"] < cutoff]

    #keep only matches at or after the last reset, when one is known
    h1 = base_h1[base_h1["series_datetime"] >= lr1] if lr1 is not None else base_h1
    h2 = base_h2[base_h2["series_datetime"] >= lr2] if lr2 is not None else base_h2

    #skip matches where either team has too little usable history
    if len(h1) < MIN_MATCHES or len(h2) < MIN_MATCHES:
        continue

    patch_asof = patch_index_asof(sched)
    pool = active_pool_asof(sched)
    #skip matches without a complete (7-map) pool
    if not isinstance(pool, list) or len(pool) < 7:
        continue
    # lowercased pool, stable order
    pool = list(dict.fromkeys([p.strip().lower() for p in pool if isinstance(p, str) and p.strip()]))

    #expected maps by best_of; keep played maps only for evaluation
    best_of = row.get("match_best_of", 3)
    target_map_count = pick_expected_maps(best_of)

    #extract valid played maps from this match; limit to target_map_count, filter to current pool
    #(maps after the first target_map_count in-pool maps are not recorded as played)
    played_maps = []
    for i in range(1, 6):
        m = row.get(f"Map{i}_name")
        if isinstance(m, str):
            mm = m.strip().lower()
            if mm in pool:
                played_maps.append(mm)
        if len(played_maps) >= target_map_count:
            break
    #skip matches with incomplete, invalid map data
    if len(played_maps) != target_map_count:
        continue
    #store as set for quick membership checks
    played_set = set(played_maps)

    contexts.append((h1, h2, pool, sched, patch_asof, played_set, t1_c, t2_c, row["MatchID"], raw_ts, target_map_count))

#score every match context and emit one output row per (match, map in pool)
csv_rows = []
for (h1, h2, pool, sched, patch_asof, played_set,
     t1_c, t2_c, match_id, raw_ts, target_sum) in contexts:
    #per-team map scores, weighted by patch/time/roster features
    team1_scores = compute_team_map_scores(h1, t1_c, patch_asof, sched, pool)
    team2_scores = compute_team_map_scores(h2, t2_c, patch_asof, sched, pool)
    #joint raw score per map is the sum of both teams' scores
    joint_scores = np.array(
        [team1_scores[name] + team2_scores[name] for name in pool], dtype=float
    )

    #softmax scaled to the expected map count, then capped at 1.0 per map
    map_probs = cap_and_waterfill(
        softmax_to_target(joint_scores, target_sum), target_sum, cap=1.0
    )

    csv_rows.extend(
        {
            "MatchID": match_id,
            "team1_name": t1_c,
            "team2_name": t2_c,
            "series_datetime_provider": raw_ts,         # for audit
            "series_datetime_canonical": sched,         # used for computation
            "map_name": name,                           # lowercased
            "map_predicted_prob": float(prob),
            "expected_maps": target_sum,
            "map_was_played": float(name in played_set),
            "patch_index_asof": float(patch_asof),
            "patch_index": float(patch_asof),           # kept for backward-compat
        }
        for name, prob in zip(pool, map_probs)
    )

#column order of the exported file; passing it explicitly keeps the frame well-formed
#(and sortable) even when no match passed the filters and csv_rows is empty
OUTPUT_COLUMNS = [
    "MatchID", "team1_name", "team2_name",
    "series_datetime_provider", "series_datetime_canonical",
    "map_name", "map_predicted_prob", "expected_maps", "map_was_played",
    "patch_index_asof", "patch_index",
]
df_out = pd.DataFrame(csv_rows, columns=OUTPUT_COLUMNS)
df_out = df_out.sort_values(["series_datetime_canonical","MatchID","map_name"]).reset_index(drop=True)

if df_out.empty:
    #nothing to validate; still write a header-only file so downstream readers find the columns
    print("Warning: no matches passed the history/pool/map filters; writing a header-only file.")
else:
    #sanity check for no duplicate matchid, map_name combos
    #(explicit raise instead of `assert`, so the check still runs under `python -O`)
    if df_out.duplicated(subset=["MatchID","map_name"]).any():
        raise AssertionError("Duplicate (MatchID, map_name)!")

    #verifies cap logic, ensuring no map probability exceeds 1.0
    viol = (df_out["map_predicted_prob"] > 1.0 + 1e-9).sum()
    if viol:
        raise AssertionError(f"Found {viol} rows with map_predicted_prob > 1.0 (cap failed).")

    #check softmax normalization; per-match map probabilities sum= expected maps
    sums = (df_out.groupby(["MatchID","expected_maps"])
                  .agg(pred_sum=("map_predicted_prob","sum")).reset_index())
    drift = (sums["pred_sum"] - sums["expected_maps"]).abs().max() if len(sums) else 0.0
    if drift and drift > 1e-6:
        print(f"Note: tiny mass drift observed (max {drift:.6f}); acceptable.")

df_out.to_csv(OUTPUT_PATH, index=False)
print(f"Map preds saved: {OUTPUT_PATH}, rows={len(df_out)}")

Map preds saved: /Users/samharwood/Downloads/map_preds_demo.csv, rows=6230
