In [1]:
import warnings
warnings.filterwarnings('ignore', category=RuntimeWarning)

import pandas as pd
import numpy as np
from tqdm import tqdm
from pandas.api.types import is_datetime64tz_dtype

#input / output locations
FEATURES_PATH = "data/vlr_features_demo.csv"
MAP_PRED_PATH = "data/vlr_map_preds_demo.csv"
OUTPUT_PATH   = "data/vlr_agent_probs_demo.csv"

#hyperparameters; placeholder values used
PATCH_LAM = 0.5  #patch decay rate
REC_LAM   = 0.5  #time-based recency decay
CORE_LAM  = 0.5  #roster continuity decay
MOV_LAM   = 0.5  #margin of victory weight
PATCH_WINDOW = 1  #number of patch versions to consider (applied as PATCH_WINDOW*100 bucket units)
REQ_SAMPLE_RATIO = 5 / 7  #not referenced in the visible part of this script
BATCH_SIZE = 1000  #processing batch size
#blend weights for the player / team / meta distributions
PLAYER_WEIGHT = 0.6
TEAM_WEIGHT   = 0.2
META_WEIGHT   = 0.2
PLAYER_NT_HISTORICAL_WEIGHT = 0.5  #weight for non-team-specific player history
MIN_CUTOFF = 0.05  #minimum probability threshold for redistribution

#role caps with placeholders
ROLE_HARD_CAP = {
    "Duelist": 1.5,
    "Initiator": 1.7,
    "Controller": 1.5,
    "Sentinel": 1.9,
}

#agents listed per role; flattened below into the agent -> role lookup
_AGENTS_BY_ROLE = {
    'Duelist': ('Jett', 'Reyna', 'Phoenix', 'Yoru', 'Neon', 'Raze', 'Iso', 'Waylay'),
    'Initiator': ('Sova', 'Skye', 'Breach', 'Kayo', 'Fade', 'Gekko', 'Tejo'),
    'Controller': ('Brimstone', 'Omen', 'Viper', 'Astra', 'Harbor', 'Clove'),
    'Sentinel': ('Sage', 'Killjoy', 'Cypher', 'Chamber', 'Deadlock', 'Vyse'),
}
AGENT_ROLE = {
    agent: role
    for role, agents in _AGENTS_BY_ROLE.items()
    for agent in agents
}

#standardize team names; trim whitespace, collapse known aliases
def canonicalize_team(name: str) -> str:
    """Return the canonical spelling of a team name.

    Strings are stripped of surrounding whitespace, and "nrg esports"
    (compared case-insensitively) is mapped to "NRG". Non-string input is
    returned as its ``str()`` form.
    """
    if isinstance(name, str):
        trimmed = name.strip()
        if trimmed.lower() == "nrg esports":
            return "NRG"
        return trimmed
    return str(name)

#compute roster stability weight from the days_since_{3,4,5}_of_5 columns using exponential saturation
def roster_confidence(r, lam):
    """Blend three saturating terms, one per days_since_k_of_5 field.

    Each term is ``1 - exp(-lam * days)`` (0 at zero days, approaching 1 as
    days grows); the terms for 3, 4 and 5 of 5 are weighted 0.2, 0.3 and 0.5.

    Args:
        r: mapping-like object (dict, row, or DataFrame) exposing
            'days_since_3_of_5', 'days_since_4_of_5', 'days_since_5_of_5'.
        lam: decay rate applied to every term.

    Returns:
        The weighted sum, scalar or element-wise depending on ``r``.
    """
    # NOTE(review): the previous comment said a higher gap means lower
    # confidence, but this formula increases with the day counts - confirm
    # which direction is intended.
    tiers = (
        ('days_since_3_of_5', 0.2),
        ('days_since_4_of_5', 0.3),
        ('days_since_5_of_5', 0.5),
    )
    score = None
    for column, share in tiers:
        part = share * (1 - np.exp(-lam * r[column]))
        score = part if score is None else score + part
    return score

#bucket numeric values down to the nearest hundred; invalids coerced to zero. more intuitive patch handling
def to_bucket(s):
    """Floor each value of a Series to a multiple of 100.

    Values that cannot be parsed as numbers become 0 before bucketing, and
    fractional values are truncated to integers first.
    """
    numeric = pd.to_numeric(s, errors='coerce')
    whole = numeric.fillna(0).astype(int)
    return whole.floordiv(100).mul(100)

#load per-player match features and the per-map predictions to score
features = pd.read_csv(FEATURES_PATH, parse_dates=['series_datetime'])

map_preds = pd.read_csv(MAP_PRED_PATH, parse_dates=['series_datetime_canonical'])
map_preds['series_datetime'] = map_preds['series_datetime_canonical']

for df in (features, map_preds):
    #ensure parsed then strip tz if present
    df['series_datetime'] = pd.to_datetime(df['series_datetime'], errors='coerce')
    #dtype isinstance check replaces the deprecated is_datetime64tz_dtype helper
    if isinstance(df['series_datetime'].dtype, pd.DatetimeTZDtype):
        df['series_datetime'] = df['series_datetime'].dt.tz_convert(None)


#numeric view of an optional column; a missing column yields a constant Series instead of a scalar
def _numeric_or_default(frame, column, default):
    """Return `column` coerced to numeric with NaN -> `default`, or a Series of `default` if absent."""
    if column not in frame.columns:
        return pd.Series(default, index=frame.index)
    return pd.to_numeric(frame[column], errors='coerce').fillna(default)


#clean features
features['patch_index'] = pd.to_numeric(features['patch_index'], errors='coerce').fillna(0).astype(int)
features['days_since_last_match'] = pd.to_numeric(features['days_since_last_match'], errors='coerce').fillna(999)
for k in (3,4,5):
    features[f'days_since_{k}_of_5'] = _numeric_or_default(features, f'days_since_{k}_of_5', 999).astype(int)
features['margin_of_victory'] = _numeric_or_default(features, 'margin_of_victory', 0)
features['player_team_full'] = features['player_team_full'].astype(str).map(canonicalize_team)
features['map_name'] = features['map_name'].astype(str).str.lower()
features['role'] = features['player_agent'].map(AGENT_ROLE)
features = features.drop_duplicates(subset=['series_datetime','player_handle','map_name','player_agent'])
features['patch_bucket'] = to_bucket(features['patch_index'])

#build per-match team rosters; unique, sorted player handles per (team, series_datetime)
team_match_rosters = (
    features
      .groupby(['player_team_full','series_datetime'], as_index=False)['player_handle']
      .apply(lambda s: sorted(set(s)))
      .rename(columns={'player_handle':'roster_handles'})
)

#features ordered by team, newest match first; the roster fallback reads recent unique players from this
features_sorted = features.sort_values(['player_team_full','series_datetime'], ascending=[True, False])

#derive patch_bucket from patch_index, falling back to patch_major
if 'patch_index' in map_preds.columns:
    map_preds['patch_bucket'] = to_bucket(map_preds['patch_index'])
elif 'patch_major' in map_preds.columns:
    #coerce before scaling so a string-typed column is not repeated 100 times by `*`
    map_preds['patch_bucket'] = to_bucket(pd.to_numeric(map_preds['patch_major'], errors='coerce') * 100)
else:
    raise KeyError("map_preds must have either 'patch_index' or 'patch_major'")

#canonicalize team names
for col in ['team1_name','team2_name']:
    if col in map_preds.columns:
        map_preds[col] = map_preds[col].astype(str).map(canonicalize_team)

#lowercase map names for consistent joins
map_preds['map_name'] = map_preds['map_name'].astype(str).str.lower()

#indexes; sorted so tuple lookups with .loc are supported efficiently
all_agents = sorted(features['player_agent'].unique())
features_team_map = features.set_index(['player_handle','map_name','player_team_full']).sort_index()
features_by_player_map = features.set_index(['player_handle','map_name']).sort_index()
#memo for the non-team-specific player distributions
player_map_cache = {}

#compute agent pick probabilities weighted by patch recency, match recency, roster confidence, MOV
def weighted_probs(df, patch_bucket):
    """Turn history rows into a normalized agent distribution over all_agents.

    Each row's weight is the product of four factors:
      * patch: 1 / (1 + PATCH_LAM * log1p(gap)), gap = patch_bucket minus the
        row's patch_bucket, floored at 0
      * recency: exp(-REC_LAM * days_since_last_match) of the row
      * roster: roster_confidence(df, CORE_LAM)
      * margin: 1 + MOV_LAM * margin_of_victory

    Weights are summed per player_agent and divided by their total.

    Returns:
        Series indexed by all_agents; all zeros when `df` is empty or the
        total weight is not positive.
    """
    if df.empty:
        return pd.Series(0.0, index=all_agents)

    patch_gap = np.maximum(0, patch_bucket - df['patch_bucket'])
    patch_factor = 1.0 / (1.0 + PATCH_LAM * np.log1p(patch_gap))
    recency_factor = np.exp(-REC_LAM * df['days_since_last_match'])
    roster_factor = roster_confidence(df, CORE_LAM)
    # NOTE(review): a sufficiently negative margin_of_victory makes this
    # factor negative - confirm the column's range.
    margin_factor = 1 + MOV_LAM * df['margin_of_victory']

    row_weight = patch_factor * recency_factor * roster_factor * margin_factor
    per_agent = pd.Series(row_weight).groupby(df['player_agent']).sum()
    per_agent = per_agent.reindex(all_agents, fill_value=0.0)

    total = per_agent.sum()
    if total > 0:
        return per_agent / total
    return pd.Series(0.0, index=all_agents)

#compute team and meta level agent pick probabilities on a given map prior to dt, weighted by patch/time decay
def get_team_meta_probs(team_name, mp, patch_bucket, dt):
    """Return (team_probs, meta_probs) for one team on one map.

    Both distributions use only `features` rows on map `mp` dated strictly
    before `dt`. The team distribution uses rows of `team_name`; the meta
    distribution uses rows of every other team. When PATCH_WINDOW is truthy,
    rows older than PATCH_WINDOW*100 bucket units below `patch_bucket` are
    excluded.

    Returns:
        Two Series indexed by all_agents.
    """
    #shared part of both filters, evaluated once
    earlier_on_map = (features['map_name'] == mp) & (features['series_datetime'] < dt)
    is_team = features['player_team_full'] == team_name

    team_hist = features[earlier_on_map & is_team]
    meta = features[earlier_on_map & ~is_team]

    if PATCH_WINDOW:
        oldest_bucket = patch_bucket - PATCH_WINDOW*100
        team_hist = team_hist[team_hist['patch_bucket'] >= oldest_bucket]
        meta = meta[meta['patch_bucket'] >= oldest_bucket]

    team_probs = weighted_probs(team_hist, patch_bucket).reindex(all_agents, fill_value=0.0)
    meta_probs = weighted_probs(meta, patch_bucket).reindex(all_agents, fill_value=0.0)
    return team_probs, meta_probs

#fetch the rows stored under `key` in a sorted, indexed feature frame, restricted to before dt and the patch window
def _history_before(indexed, key, patch_bucket, dt):
    """Return matching history rows as a DataFrame, or None when `key` is absent from the index."""
    #only the lookup is guarded, so KeyErrors raised by later steps are not hidden
    try:
        hist = indexed.loc[key]
    except KeyError:
        return None
    if isinstance(hist, pd.Series):
        #a fully specified key on a unique index returns one row as a Series; refetch it as a frame
        hist = indexed.loc[[key]]
    hist = hist[hist['series_datetime'] < dt]
    if PATCH_WINDOW:
        hist = hist[hist['patch_bucket'] >= (patch_bucket - PATCH_WINDOW*100)]
    return hist


#compute player-level agent pick probabilities on a given map before dt, preferring team-specific history and falling back to global player history with caching
def get_player_probs(player, mp, team, patch_bucket, dt):
    """Return a Series over all_agents describing `player`'s agent picks on map `mp`.

    Team-specific history (player, map, team) is used when any rows remain
    after the date and patch-window filters; that result is normalized by
    weighted_probs and returned as-is. Otherwise the player's history on the
    map across all teams is used and the distribution is multiplied by
    PLAYER_NT_HISTORICAL_WEIGHT. Fallback results are memoized in
    player_map_cache under (player, mp, patch_bucket, dt).

    Returns:
        Series indexed by all_agents; all zeros when no usable history exists.
    """
    hist = _history_before(features_team_map, (player, mp, team), patch_bucket, dt)
    if hist is not None and not hist.empty:
        return weighted_probs(hist, patch_bucket)

    key = (player, mp, patch_bucket, dt)
    if key in player_map_cache:
        return player_map_cache[key]

    hist = _history_before(features_by_player_map, (player, mp), patch_bucket, dt)
    if hist is None or hist.empty:
        p = pd.Series(0.0, index=all_agents)
    else:
        p = weighted_probs(hist, patch_bucket)

    res = p * PLAYER_NT_HISTORICAL_WEIGHT
    player_map_cache[key] = res
    return res

#get most recent active 5-player roster for a team as of a specific datetime with edge case handling
def get_active_roster(team: str, dt: pd.Timestamp, features_df: pd.DataFrame) -> list[str]:
    """Return up to five player handles for `team` as of `dt`.

    The roster of the team's latest match strictly before `dt` (from
    team_match_rosters) is used first. If it has fewer than five handles,
    it is topped up with the team's most recently seen unique players from
    features_sorted, skipping handles already chosen.

    Args:
        team: team name; canonicalized before lookup.
        dt: cutoff timestamp; only matches before it are considered.
        features_df: accepted for interface compatibility; the body reads the
            module-level team_match_rosters and features_sorted instead.

    Returns:
        List of at most five handles (possibly empty).
    """
    team = canonicalize_team(team)

    #roster of the latest match before dt, if any
    earlier = team_match_rosters[(team_match_rosters['player_team_full'] == team) &
                                 (team_match_rosters['series_datetime'] < dt)]
    roster = []
    if not earlier.empty:
        newest = earlier['series_datetime'].idxmax()
        roster = list(earlier.loc[newest, 'roster_handles'])

    #a full roster is returned without further modification
    if len(roster) >= 5:
        return roster[:5]

    #fallback: top up with most recent unique players overall (excluding those already selected)
    past_rows = features_sorted[(features_sorted['player_team_full'] == team) &
                                (features_sorted['series_datetime'] < dt)]
    candidates = past_rows.drop_duplicates(subset=['player_handle'])['player_handle'].tolist()
    chosen = set(roster)
    for handle in candidates:
        if len(roster) >= 5:
            break
        if handle in chosen:
            continue
        roster.append(handle)
        chosen.add(handle)

    return roster[:5]

#main loop; split map_preds into record batches for processing, each batch contains up to specified BATCH_SIZE rows
rows = map_preds.to_dict(orient='records')
batches = [rows[i:i+BATCH_SIZE] for i in range(0, len(rows), BATCH_SIZE)]
results = []
print(f"Processing {len(rows)} rows across all matches...")

#grouping keys for one player's agent distribution on one map of one match
PLAYER_MAP_KEYS = ['MatchID','player_handle','map_name']


#divide `col` by its per-group total; groups whose total is not positive are left unchanged
def _normalize_within(frame, keys, col):
    """Return frame[col] normalized to sum to 1 within each `keys` group."""
    totals = frame.groupby(keys)[col].transform('sum')
    return (frame[col] / totals).where(totals > 0, frame[col])


#generates player-agent prob records for team/map combos by combining player, team, and meta distributions weighted by map prediction probs
for batch in tqdm(batches, desc="Processing", unit="batch"):
    for row in batch:
        match_id     = row['MatchID']
        dt           = row['series_datetime']
        patch_bucket = row['patch_bucket']
        mp           = row['map_name']
        prob         = row['map_predicted_prob']

        if 'team1_name' in row and 'team2_name' in row:
            team_names = [row['team1_name'], row['team2_name']]
        else:
            #ordered de-duplication keeps left/right order stable; missing sides are skipped
            sides = (row.get('team_left_full'), row.get('team_right_full'))
            team_names = [t for t in dict.fromkeys(sides) if pd.notna(t)]

        for team in team_names:
            team = canonicalize_team(team)
            #last match's roster with fallback to recent uniques; already capped at five players
            players = get_active_roster(team, dt, features)

            team_probs, meta_probs = get_team_meta_probs(team, mp, patch_bucket, dt)

            for player in players:
                p_probs = get_player_probs(player, mp, team, patch_bucket, dt)
                final = PLAYER_WEIGHT*p_probs + TEAM_WEIGHT*team_probs + META_WEIGHT*meta_probs
                if final.sum() > 0:
                    final = final / final.sum()
                for agent, ap in final.items():
                    results.append({
                        'MatchID': match_id,
                        'series_datetime': dt,
                        'player_handle': player,
                        'player_agent': agent,
                        'map_name': mp,
                        'player_team_full': team,
                        'role': AGENT_ROLE.get(agent, 'NA'),
                        'per_map_prob': ap,
                        'raw_prob': ap * prob,
                        'map_prob': prob
                    })

#fail with an explicit message instead of a KeyError on the empty frame below
if not results:
    raise ValueError("No agent probability rows were generated; check that map_preds has rows "
                     "and that its teams have earlier matches in features")

df = pd.DataFrame(results)

#series-level normalization; each player's mass across the series is scaled to the number of maps played
map_counts = map_preds.groupby('MatchID')['map_was_played'].sum()
df['num_maps'] = df['MatchID'].map(map_counts)
df['normalized_prob'] = _normalize_within(df, ['MatchID','player_handle'], 'raw_prob') * df['num_maps']

#role caps applied at team level; roles over their cap are scaled down to it
role_totals = df.groupby(['MatchID','map_name','player_team_full','role'], observed=True)['normalized_prob'] \
                .sum().reset_index().rename(columns={'normalized_prob':'expected_role_count'})
df = df.merge(role_totals, on=['MatchID','map_name','player_team_full','role'], how='left')
cap = df['role'].map(ROLE_HARD_CAP).fillna(np.inf)
scale = np.where(df['expected_role_count'] > cap, cap/df['expected_role_count'], 1.0)
df['adjusted_prob'] = df['normalized_prob'] * scale

#re-normalization on matchid, player_handle, map_name after team scaling
df['adjusted_prob'] = _normalize_within(df, PLAYER_MAP_KEYS, 'adjusted_prob')

#min_cutoff redistribution; agents with extremely low probabilities like generated from meta dist are redistributed to higher probability agents that make more sense at a player level
#a player/map group with no agent at or above the cutoff is kept whole, so that player is not dropped from the output
group_peak = df.groupby(PLAYER_MAP_KEYS)['adjusted_prob'].transform('max')
low  = df[(df['adjusted_prob'] < MIN_CUTOFF) & (group_peak >= MIN_CUTOFF)].copy()
keep = df[(df['adjusted_prob'] >= MIN_CUTOFF) | (group_peak < MIN_CUTOFF)].copy()
rem  = keep.groupby(PLAYER_MAP_KEYS)['adjusted_prob'].sum().reset_index().rename(columns={'adjusted_prob':'remaining'})
add  = low.groupby(PLAYER_MAP_KEYS)['adjusted_prob'].sum().reset_index().rename(columns={'adjusted_prob':'to_redist'})
#groups with nothing below the cutoff have no `add` row; fillna turns their to_redist into 0
keep = keep.merge(rem, on=PLAYER_MAP_KEYS, how='left') \
           .merge(add, on=PLAYER_MAP_KEYS, how='left').fillna(0.0)
mask = keep['remaining'] > 0
keep.loc[mask, 'adjusted_prob'] = keep.loc[mask, 'adjusted_prob'] * (1 + keep.loc[mask, 'to_redist'] / keep.loc[mask, 'remaining'])
keep = keep.drop(columns=['remaining','to_redist'])
keep['adjusted_prob'] = _normalize_within(keep, PLAYER_MAP_KEYS, 'adjusted_prob')

#convert per-map probabilities into final series (match) level probabilities; normalize
keep['raw_series_prob'] = keep['adjusted_prob'] * keep['map_prob']
keep['final_series_prob'] = _normalize_within(keep, ['MatchID','player_handle'], 'raw_series_prob') * keep['num_maps']

#collapse to one row per (match, player, agent, map), then re-attach the descriptive columns
agg = (keep.groupby(['MatchID','player_handle','player_agent','map_name'], as_index=False)
          .agg({'adjusted_prob':'sum','map_prob':'sum','per_map_prob':'sum','final_series_prob':'sum'}))
meta = keep[['MatchID','player_handle','player_agent','map_name','series_datetime','player_team_full','role']] \
       .drop_duplicates()
grouped = agg.merge(meta, on=['MatchID','player_handle','player_agent','map_name'], how='left')

grouped = grouped[['MatchID','series_datetime','player_handle','player_agent','map_name',
                   'player_team_full','role','adjusted_prob','map_prob','per_map_prob','final_series_prob']]

grouped.to_csv(OUTPUT_PATH, index=False)
print(f"Done. Deduped output saved to: {OUTPUT_PATH}")


  if is_datetime64tz_dtype(df['series_datetime']):
  if is_datetime64tz_dtype(df['series_datetime']):


Processing 6230 rows across all matches...


Processing: 100%|██████████| 7/7 [12:24<00:00, 106.39s/batch]


Done. Deduped output saved to: /Users/samharwood/Downloads/agent_probs_demo.csv
