In [34]:
import re
from datetime import timedelta
from pathlib import Path

import numpy as np
import pandas as pd

# Input CSV locations and the destination of the generated patchpool table
ROUNDSTATS_CSV = "data_public/vlr_roundstats_2023.csv"
PLAYER_STATS_CSV = "data_public/vlr_playerstats_2023.csv"
MATCHDATA_CSV = "data_public/vlr_matchstats_2023.csv"
OUT_PATH = "outputs/vlr_patchpool_demo.csv"

# Read the three source tables (round stats, player stats, match data) in that order
roundstats, player_stats, matchdata = (
    pd.read_csv(csv_path)
    for csv_path in (ROUNDSTATS_CSV, PLAYER_STATS_CSV, MATCHDATA_CSV)
)

# Parse match_datetime into timezone-aware UTC timestamps; unparseable values are coerced to NaT
matchdata['match_datetime'] = pd.to_datetime(
    matchdata.get('match_datetime'), utc=True, errors='coerce'
)

# Coerce game_id to numeric in every table; values that are not numeric are coerced to NaN
for _table in (matchdata, roundstats, player_stats):
    _table['game_id'] = pd.to_numeric(_table.get('game_id'), errors='coerce')

# Store player_handle as str so the lowercase normalization applied downstream always sees strings
player_stats['player_handle'] = player_stats['player_handle'].astype(str)

# Reshape the wide roundstats table (both teams' results on one row) into long form with
# one row per (round, team). The frame is built column-wise instead of iterating over
# every round in Python, and it keeps its columns even when roundstats is empty, so the
# dropna below no longer fails on an empty input. Rows come out as all team-1 rows followed
# by all team-2 rows; the grouped aggregation below does not depend on row order.
round_df = pd.concat(
    [
        pd.DataFrame({
            'game_id': roundstats['game_id'],
            'team_num': team_num,
            # .get tolerates a missing result column (every result is then None)
            'result': roundstats.get(f'team{team_num}_result'),
        })
        for team_num in (1, 2)
    ],
    ignore_index=True,
)

# Drop rounds without a usable game_id and flag the rounds each team won
round_df = round_df.dropna(subset=['game_id'])
round_df['is_win'] = (round_df['result'] == 'won').astype(int)

# Count rounds played and rounds won per (game_id, team_num)
total_rounds  = round_df.groupby(['game_id','team_num']).size().reset_index(name='total_rounds')
wins_by_team  = round_df.groupby(['game_id','team_num'])['is_win'].sum().reset_index(name='wins')
rd_per_team   = pd.merge(total_rounds, wins_by_team, on=['game_id','team_num'])

# Round differential = wins - (total_rounds - wins): every round not marked 'won' counts against the team
rd_per_team['Round_Diff'] = 2*rd_per_team['wins'] - rd_per_team['total_rounds']

# Nested lookup: rd_lookup[game_id][team_num] -> Round_Diff, with all keys and values as plain ints
rd_lookup = {}
for _gid, _team_num, _rd in zip(
    rd_per_team['game_id'], rd_per_team['team_num'], rd_per_team['Round_Diff']
):
    rd_lookup.setdefault(int(_gid), {})[int(_team_num)] = int(_rd)

# Names of the seven expanded pick/ban columns: picks_and_bans.1 ... picks_and_bans.7
pb_cols = ["picks_and_bans." + str(slot) for slot in range(1, 8)]

# Parses one raw picks-and-bans value into exactly seven entries.
# Missing input yields seven Nones. Otherwise the text is trimmed, surrounding
# brackets/braces/parentheses and all quote characters are removed, the remainder is
# split on commas, empty pieces are discarded, and the result is cut or None-padded to seven.
def parse_picks_bans(pb):
    n_slots = 7
    if pd.isna(pb):
        return [None] * n_slots

    cleaned = str(pb).strip().strip('{}[]()')
    cleaned = cleaned.replace("'", "").replace('"', '')

    entries = []
    for piece in re.split(r',\s*', cleaned):
        piece = piece.strip()
        if piece != "":
            entries.append(piece)

    entries = entries[:n_slots]
    return entries + [None] * (n_slots - len(entries))

# Parse every match's pick/ban value once, then spread the seven entries over the pb_cols columns
_pb_parsed = [parse_picks_bans(raw_pb) for raw_pb in matchdata['picks_and_bans']]
for slot, col in enumerate(pb_cols):
    matchdata[col] = [parsed[slot] for parsed in _pb_parsed]

# Rows sharing all of these fields are treated as maps of one series; rows without a game_id are ignored
series_keys = ['match_datetime','team1_name','team2_name','competition_name','match_best_of','match_patch']
series_groups = matchdata.dropna(subset=['game_id']).groupby(series_keys, dropna=False)
next_match_id = 1

# One record per series: a sequential MatchID, the grouping fields, up to five map ids
# (in ascending game_id order, pd.NA where the series has fewer maps), and the pick/ban
# entries taken from the series' first map
_series_fields = ['series_datetime', 'team1_name', 'team2_name',
                  'competition_name', 'match_best_of', 'match_patch']
series_id_rows = []
for keys, g in series_groups:
    ordered_maps = g.sort_values('game_id')
    game_ids = ordered_maps['game_id'].astype(int).tolist()

    record = {'MatchID': next_match_id}
    record.update(zip(_series_fields, keys))

    for map_no in range(1, 6):
        record[f"Map{map_no}_ID"] = game_ids[map_no - 1] if map_no <= len(game_ids) else pd.NA

    first_map = ordered_maps.iloc[0]
    for col in pb_cols:
        record[col] = first_map.get(col)

    series_id_rows.append(record)
    next_match_id += 1

series_df = pd.DataFrame(series_id_rows)

# Split player_team into the text before the first "(" (abbreviation) and the text inside
# the first pair of parentheses (full name); both parts are whitespace-trimmed
_player_team_text = player_stats['player_team'].astype(str)
player_stats['player_team_abbrev'] = _player_team_text.str.extract(r'^([^\(]+)', expand=False).str.strip()
player_stats['player_team_full']   = _player_team_text.str.extract(r'\(([^)]+)\)', expand=False).str.strip()

# Unique (game_id, match_datetime) pairs, renamed to the series_id / series_datetime names used by rosters
map_dt = (
    matchdata.loc[:, ['game_id', 'match_datetime']]
    .dropna()
    .drop_duplicates()
    .rename(columns={'game_id': 'series_id', 'match_datetime': 'series_datetime'})
)

# Per-map rosters: for every (series_id, team_full) a sorted list of unique player handles,
# with the map's datetime attached through a left merge on series_id
_roster_source = (
    player_stats
    .rename(columns={'game_id': 'series_id', 'player_team_full': 'team_full'})
    .dropna(subset=['series_id', 'team_full', 'player_handle'])
)
_handles_per_team = (
    _roster_source
    .groupby(['series_id', 'team_full'])['player_handle']
    .apply(lambda handles: sorted({str(h) for h in handles}))
)
rosters = _handles_per_team.reset_index().merge(map_dt, on='series_id', how='left')

# Signature rules placeholder; users can add brand-specific disambiguation if desired.
# Shape as consumed below: {brand: {bucket_label: iterable_of_player_handles}}
SIGNATURE_RULES = {}  # Placeholder; intentionally empty for public release

# Canonical form of a handle for comparisons: string-converted, trimmed, lowercased
def _canon_handle(s):
    text = str(s)
    return text.strip().lower()

# Resolves a team banner through SIGNATURE_RULES.
# The first brand whose lowercase name occurs inside the lowercase banner is used. Within
# that brand, the first bucket sharing at least one canonical handle with the roster gives
# "<brand> <BUCKET>" (or just the brand when the bucket label is empty). A matched brand
# with no matching bucket returns the brand; with no matching brand the trimmed banner
# is returned unchanged (always the case while SIGNATURE_RULES is empty).
def disambiguate_brand(team_full: str, roster_list):
    base = (team_full or "").strip()
    banner_lc = base.lower()
    handles = set(map(_canon_handle, roster_list or []))

    for brand, buckets in SIGNATURE_RULES.items():
        if brand.lower() not in banner_lc:
            continue
        for _bucket, names in buckets.items():
            signature = set(map(_canon_handle, names))
            if not (handles & signature):
                continue
            if _bucket:
                return f"{brand} {_bucket.upper()}"
            return brand
        return brand
    return base

# Computes the corrected banner for one roster row from its team name and player handle list
def _corrected_banner(roster_row):
    return disambiguate_brand(roster_row['team_full'], roster_row['player_handle'])

# Store the corrected banner for every roster row
rosters['team_full_corr'] = rosters.apply(_corrected_banner, axis=1)

# Builds the output record for one side (1 or 2) of a series row.
# The record carries the series-level fields, the side's team name under 'team', the five
# map ids, a per-map round diff read from rd_lookup (pd.NA when the map id is missing or
# has no lookup entry for this side), the sum of the available diffs as SeriesRoundDiff
# (0 when none are available), and the pick/ban columns.
def _team_row_for_side(series_row, team_side: int):
    shared_fields = (
        'MatchID', 'series_datetime', 'team1_name', 'team2_name',
        'competition_name', 'match_best_of', 'match_patch',
    )
    base = {field: series_row[field] for field in shared_fields}
    base['team'] = series_row['team1_name' if team_side == 1 else 'team2_name']

    map_slots = range(1, 6)
    for slot in map_slots:
        base[f"Map{slot}_ID"] = series_row.get(f"Map{slot}_ID")

    # Per-map round diffs plus their running total for the series
    rd_sum = 0
    for slot in map_slots:
        map_id = series_row.get(f"Map{slot}_ID")
        rd = pd.NA
        if pd.notna(map_id):
            rd = rd_lookup.get(int(map_id), {}).get(team_side, pd.NA)
            if pd.notna(rd):
                rd_sum += int(rd)
        base[f"Map{slot}_RD"] = rd
    base['SeriesRoundDiff'] = rd_sum

    # Carry the pick/ban columns over unchanged
    base.update((col, series_row.get(col)) for col in pb_cols)

    return base

# Every series contributes two consecutive records: side 1 first, then side 2
team_rows = [
    _team_row_for_side(series_row, side)
    for _, series_row in series_df.iterrows()
    for side in (1, 2)
]

patchpool = pd.DataFrame(team_rows)

# Fully populated map-level rows of matchdata; series_id is the integer game_id
# (game_id is reused as series_id for the roster table)
map_series = (
    matchdata[['game_id','match_datetime','team1_name','team2_name']]
    .dropna()
    .assign(series_id=lambda frame: frame['game_id'].astype(int))
)

# Matches team_display_name to a roster entry recorded at series_dt.
# Returns (team_full, corrected banner, player list), or (None, None, []) when no roster
# can be attributed to the team.
#
# Candidates are the roster rows whose series_datetime equals series_dt. Each candidate is
# scored by the number of alphanumeric tokens its team_full shares with the display name;
# a case-insensitive exact name match ranks above any token score (it can only break ties,
# since an exact match already shares every token, and it rescues names that contain no
# ASCII alphanumeric tokens at all).
#
# Changes from the previous version:
#   * a candidate with zero shared tokens and no exact match is rejected; previously the
#     top-sorted row was returned even then, attaching an unrelated team's roster
#   * ties resolve to the first candidate in rosters order; previously they depended on
#     sort_values' default, non-stable sort
def pick_roster(series_dt, team_display_name):
    cand = rosters[rosters['series_datetime'] == series_dt]
    if cand.empty:
        return None, None, []

    target = str(team_display_name).strip().lower()
    tkns = set(re.findall(r'[A-Za-z0-9]+', target))

    def score_name(team_full):
        name = str(team_full).strip().lower()
        overlap = len(tkns & set(re.findall(r'[A-Za-z0-9]+', name)))
        # Tuples compare element-wise: exact match first, then token overlap
        return (name == target, overlap)

    scores = [score_name(name) for name in cand['team_full']]
    best = max(scores)
    if best == (False, 0):
        # Nothing at this datetime resembles the requested team
        return None, None, []

    # list.index returns the first occurrence, so ties are broken deterministically
    row = cand.iloc[scores.index(best)]
    return row['team_full'], row['team_full_corr'], row['player_handle']

# Look up a roster for every team row from its series datetime and displayed team name
_roster_picks = [
    pick_roster(series_dt, team_name)
    for series_dt, team_name in zip(patchpool['series_datetime'], patchpool['team'])
]
picked_team_full = [pick[0] for pick in _roster_picks]
picked_team_full_corr = [pick[1] for pick in _roster_picks]
picked_roster = [pick[2] for pick in _roster_picks]

# Attach the raw banner, the corrected banner (the one used downstream), the roster and its size
patchpool['team_full_raw'] = picked_team_full
patchpool['team_full'] = picked_team_full_corr
patchpool['roster'] = picked_roster
patchpool['roster_size'] = [
    len(entry) if isinstance(entry, list) else 0
    for entry in patchpool['roster']
]

# Order rows by team banner, then series datetime, then MatchID, so each team's rows are chronological
patchpool = (
    patchpool
    .sort_values(['team_full','series_datetime','MatchID'])
    .reset_index(drop=True)
)

# prev_seen maps a team banner to (datetime, canonical handle set) of its latest 5-player roster;
# the three lists collect one value per patchpool row, in row order
prev_seen = {}
overlap_prev = []
core_ratio_prev = []
days_since_prev = []

# Walk the rows in their sorted order. A row whose roster has exactly five distinct
# canonical handles is compared with the team's previously stored roster: shared-player
# count, that count divided by 5, and the gap in days between the two datetimes (NaN when
# either datetime is missing). Rows without a 5-player roster, or without a stored
# predecessor, get NaN for all three. Only 5-player rosters replace the stored entry.
for brand, dt, raw_roster in zip(
    patchpool['team_full'], patchpool['series_datetime'], patchpool['roster']
):
    rost = raw_roster if isinstance(raw_roster, list) else []
    rost_set = {_canon_handle(handle) for handle in rost}
    has_full_five = len(rost_set) == 5

    ov = np.nan
    ratio = np.nan
    delta = np.nan
    if has_full_five and brand in prev_seen:
        last_dt, last_rost = prev_seen[brand]
        ov = len(rost_set & last_rost)
        ratio = ov / 5.0
        if pd.notna(dt) and pd.notna(last_dt):
            delta = (dt - last_dt).total_seconds() / (3600 * 24)

    overlap_prev.append(ov)
    core_ratio_prev.append(ratio)
    days_since_prev.append(delta)

    if has_full_five:
        prev_seen[brand] = (dt, rost_set)

patchpool['roster_overlap_prev'] = overlap_prev
patchpool['core_ratio']          = core_ratio_prev
patchpool['days_since_prev']     = days_since_prev

# Preferred column order of the exported table
ordered_cols = [
    'MatchID','team','team1_name','team2_name','series_datetime',
    'competition_name','match_best_of','match_patch',
    'SeriesRoundDiff',
    'Map1_ID','Map2_ID','Map3_ID','Map4_ID','Map5_ID',
    'Map1_RD','Map2_RD','Map3_RD','Map4_RD','Map5_RD',
    *pb_cols,
    'team_full_raw','team_full','roster','roster_size',
    'roster_overlap_prev','core_ratio','days_since_prev'
]

# Keep only the expected columns that actually exist, in the preferred order
final_cols = [c for c in ordered_cols if c in patchpool.columns]
patchpool = patchpool[final_cols].copy()

# Create the destination folder first: to_csv does not create missing parent directories,
# so a fresh checkout without the output folder would otherwise fail at the very last step
Path(OUT_PATH).parent.mkdir(parents=True, exist_ok=True)
patchpool.to_csv(OUT_PATH, index=False)

# Report the row count, the output location, and a preview of the first eight rows
print(f"Patchpool built: {len(patchpool):,} rows")
print(f"Saved → {OUT_PATH}")
with pd.option_context("display.max_columns", None, "display.width", 160):
    print(patchpool.head(8).to_string(index=False))


Patchpool built: 10,370 rows
Saved → /Users/samharwood/Downloads/vlr_patchpool_demo.csv
 MatchID              team      team1_name        team2_name           series_datetime                                    competition_name  match_best_of                                                                            match_patch  SeriesRoundDiff  Map1_ID Map2_ID Map3_ID Map4_ID Map5_ID Map1_RD Map2_RD Map3_RD Map4_RD Map5_RD picks_and_bans.1 picks_and_bans.2 picks_and_bans.3 picks_and_bans.4 picks_and_bans.5 picks_and_bans.6 picks_and_bans.7     team_full_raw         team_full                                                      roster  roster_size  roster_overlap_prev  core_ratio  days_since_prev
    5096 #1 Victory Royale         Spot Up #1 Victory Royale 2021-04-15 23:30:00+00:00 Champions Tour North America Stage 2: Challengers 2              3         Patch 2.06\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\tViper and Yoru buffs\nHRTF Audio               -7    24716   24717    <NA>    <NA>    <