In [1]:
import pandas as pd
import numpy as np
import re

#Helper features file: builds a per-player, per-map feature table from two CSV exports

#input CSVs (relative to the working directory) and the destination of the feature table
PLAYER_STATS_PATH = "data/vlr_playerstats_demo.csv"
MATCH_STATS_PATH  = "data/vlr_matchstats_with_matchid_demo.csv"
OUTPUT_PATH       = "data/vlr_features_demo.csv"

#load both inputs eagerly; pd.read_csv raises if either file is missing
player_stats = pd.read_csv(PLAYER_STATS_PATH)
match_stats  = pd.read_csv(MATCH_STATS_PATH)

#patch string conversion into numeric index
def patch_to_index(patch):
    """Convert a patch label into a sortable numeric index.

    The first ``<major>.<minor>`` number found in ``str(patch)`` is encoded as
    ``major * 100 + minor`` (for example ``"Patch 7.04"`` becomes ``704``).

    Args:
        patch: Patch label. Any value is accepted; it is stringified first.

    Returns:
        The integer index, or ``np.nan`` when no ``<major>.<minor>`` number
        can be found or parsed.
    """
    try:
        found = re.search(r'(\d+\.\d+)', str(patch))
        if not found:
            return np.nan
        major, minor = map(int, found.group(1).split('.'))
    except (TypeError, ValueError):
        #only parse failures map to NaN; SystemExit/KeyboardInterrupt must propagate
        return np.nan
    return major * 100 + minor

#split the team label: text before the first "(" is the abbreviation,
#text inside the parentheses is the full name; both are whitespace-trimmed
#(player_team_abbrev is not carried past the column selection below)
player_stats['player_team_abbrev'] = (
    player_stats['player_team'].str.extract(r'^([^\(]+)', expand=False).str.strip()
)
player_stats['player_team_full'] = (
    player_stats['player_team'].str.extract(r'\(([^)]+)\)', expand=False).str.strip()
)

#drop showmatch rows (full team name contains "Spotlight") and keep only essential stats;
#rows with no parsed full name are retained (na=False)
player_stats = player_stats.loc[
    ~player_stats['player_team_full'].str.contains('Spotlight', na=False),
    [
        'game_id', 'player_handle', 'player_team_full',
        'player_agent', 'agent_category', 'kills',
    ],
].copy()

#unparseable timestamps become NaT instead of raising
match_stats['match_datetime'] = pd.to_datetime(
    match_stats['match_datetime'],
    errors='coerce',
)

#attach match-level context to every player row; the left join keeps
#player rows even when no match row shares their game_id
df = player_stats.merge(
    match_stats[[
        'game_id', 'match_datetime', 'match_patch', 'map_name',
        'team1_name', 'team2_name', 'team1_map_score', 'team2_map_score',
        'competition_name',
    ]],
    how='left',
    on='game_id',
)

#derived columns: numeric patch index and total rounds played on the map
df['patch_index'] = df['match_patch'].apply(patch_to_index)
df['total_rounds_played'] = df['team1_map_score'].add(df['team2_map_score'])

#keep only maps with at least 13 rounds (rows with a missing score fail the test and are dropped too)
df = df.loc[df['total_rounds_played'].ge(13)].copy()

#kills per round, computed on the surviving rows only
df['kills_per_round'] = df['kills'].div(df['total_rounds_played'])

#chronological order within each player, with a fresh 0..n-1 index
df = (
    df.sort_values(by=['player_handle', 'match_datetime'])
      .reset_index(drop=True)
)

#flag rows whose timestamp differs from the same player's previous row
#(a player's first row compares against a missing value, so it is flagged too)
is_new_block = df['match_datetime'] != df.groupby('player_handle')['match_datetime'].shift()

#running total of the flags per player: rows sharing a timestamp run share one id
df['_run_id'] = is_new_block.groupby(df['player_handle']).cumsum()

#one row per (player, run id) carrying that run's timestamp
block_map = df.groupby(['player_handle', '_run_id'], as_index=False)['match_datetime'].first()
block_map = block_map.rename(columns={'match_datetime': '_block_start'})

#timestamp of the player's preceding run (missing for the first run)
block_map['_prev_block_start'] = block_map.groupby('player_handle')['_block_start'].shift()

#copy the preceding run's timestamp onto every row of the current run
df = pd.merge(
    df,
    block_map[['player_handle', '_run_id', '_prev_block_start']],
    how='left',
    on=['player_handle', '_run_id'],
)

#whole days since the previous distinct timestamp; 999 stands in when the gap is undefined
df['days_since_last_match'] = (
    df['match_datetime']
      .sub(df['_prev_block_start'])
      .dt.days
      .fillna(999)
      .astype(int)
)

#helper columns are no longer needed
df = df.drop(columns=['_run_id', '_prev_block_start'])

#one row per (game_id, team): the sorted, de-duplicated handles seen for that team,
#with the id/team columns renamed to series_id/team_full
series_roster = (
    df.groupby(['game_id', 'player_team_full'])['player_handle']
      .apply(lambda handles: sorted(set(handles)))
      .reset_index(name='roster_handles')
      .rename(columns={'game_id': 'series_id', 'player_team_full': 'team_full'})
)

#attach each id's timestamp as series_datetime
series_roster = series_roster.merge(
    df[['game_id', 'match_datetime']]
      .rename(columns={'game_id': 'series_id', 'match_datetime': 'series_datetime'})
      .drop_duplicates(),
    how='left',
    on='series_id',
)

#roster (as a sorted tuple) -> team name; when a roster repeats, the last row wins
roster_team_map = {
    tuple(sorted(handles)): team_name
    for handles, team_name in zip(series_roster['roster_handles'], series_roster['team_full'])
}

#team name -> that team's roster rows ordered by series_datetime
team_series = {
    team_name: history.sort_values(by='series_datetime')
    for team_name, history in series_roster.groupby('team_full')
}

#days since at least k members of a roster first appeared together for their team
def compute_days_since_fast(roster, current_date, k):
    """Return the age in days of a roster's k-player core as of ``current_date``.

    The roster is resolved to a team through ``roster_team_map``; that team's
    rows in ``team_series`` dated on or before ``current_date`` are scanned for
    the earliest one sharing at least ``k`` handles with ``roster``.

    Args:
        roster: List or tuple of player handles.
        current_date: Timestamp the gap is measured from.
        k: Minimum number of shared handles required.

    Returns:
        ``pd.NA`` when ``roster`` is not a list/tuple, ``current_date`` is
        missing, or the roster cannot be resolved to a known team; ``0`` when
        no qualifying row exists; otherwise the whole number of days between
        the earliest qualifying row and ``current_date``.
    """
    if not isinstance(roster, (list, tuple)):
        return pd.NA
    if pd.isna(current_date):
        return pd.NA

    members = tuple(sorted(roster))
    team_name = roster_team_map.get(members)
    if not team_name:
        return pd.NA
    if team_name not in team_series:
        return pd.NA

    #only rows dated up to and including current_date are considered
    history = team_series[team_name]
    history = history[history['series_datetime'] <= current_date]

    def shares_core(handles):
        return len(set(handles).intersection(members)) >= k

    qualifying = history['roster_handles'].apply(shares_core)
    if not qualifying.any():
        return 0

    earliest = history.loc[qualifying, 'series_datetime'].min()
    return (current_date - earliest).days

#attach each row's team roster (matched on game id and team name);
#the join also brings in the series_id and team_full key columns
df = df.merge(
    series_roster[['series_id', 'team_full', 'roster_handles']],
    how='left',
    left_on=['game_id', 'player_team_full'],
    right_on=['series_id', 'team_full'],
)

#roster continuity: one column per core size, evaluated row by row
for k in (3, 4, 5):
    column = f'days_since_{k}_of_5'
    df[column] = df.apply(
        lambda row, core=k: compute_days_since_fast(
            row['roster_handles'], row['match_datetime'], core
        ),
        axis='columns',
    )

#map-level score margin from the point of view of the player's team
def margin_of_victory(row):
    """Return the player's team map score minus the opponent's map score.

    Args:
        row: Mapping-like record with ``player_team_full``, ``team1_name``,
            ``team2_name``, ``team1_map_score`` and ``team2_map_score``.

    Returns:
        The signed score difference (positive when the player's team scored
        more), or ``pd.NA`` when the player's team matches neither team name.
    """
    own_team = row['player_team_full']
    if own_team == row['team1_name']:
        ours, theirs = 'team1_map_score', 'team2_map_score'
    elif own_team == row['team2_name']:
        ours, theirs = 'team2_map_score', 'team1_map_score'
    else:
        return pd.NA
    return row[ours] - row[theirs]

#signed map margin for every player row
df['margin_of_victory'] = df.apply(margin_of_victory, axis='columns')

#the export exposes the match timestamp under the name series_datetime
df['series_datetime'] = df['match_datetime']

#final column order of the feature table; every other column is dropped
output_columns = [
    'player_handle', 'player_agent', 'agent_category', 'kills',
    'series_datetime', 'match_patch', 'patch_index', 'map_name',
    'days_since_last_match',
    'days_since_3_of_5', 'days_since_4_of_5', 'days_since_5_of_5',
    'margin_of_victory', 'total_rounds_played', 'kills_per_round',
    'game_id', 'player_team_full', 'team1_name', 'team2_name',
    'competition_name',
]
df = df[output_columns]

#write without the index column and report where the file went
df.to_csv(OUTPUT_PATH, index=False)
print(f" Done. Saved to {OUTPUT_PATH}")

 Done. Saved to /Users/samharwood/Downloads/vlr_features_demo.csv
