### Loading Libraries

In [1]:
import gc
import os

import duckdb
import numpy as np
import pandas as pd
from tqdm import tqdm

from feature_engineering.player_awareness import PlayerAwarenessCalculator
from feature_engineering.player_influence import PlayerInfluenceCalculator
from feature_engineering.voronoi_diagram import VoronoiDiagram

### Loading Helper Files

In [2]:
# Lookup tables shared by every weekly tracking file. They are kept as
# module-level DataFrames under exactly these names because the SQL in
# process_data() references `games`, `player_play`, `players` and `plays`
# by name.
games, player_play, players, plays = map(
    pd.read_csv,
    (
        'data/games.csv',
        'data/player_play.csv',
        'data/players.csv',
        'data/plays.csv',
    ),
)

### Data Preprocessing

In [3]:
def prep(tracking_data):
    """Build one feature row per offensive player for every tracked frame.

    Parameters
    ----------
    tracking_data : pandas.DataFrame
        One row per (gameId, playId, frameId, tracked object). The columns read
        here are: gameId, playId, frameId, nflId, week, team (1 = offense,
        -1 = defense, 0 = football), x, y, s, o, dir, distance_to_los,
        delta_x/y/s/a/o/dir, the position_* one-hot flags and the
        isBallCarrier / isReceiver / isRusher / isBlocker flags.

    Returns
    -------
    pandas.DataFrame
        One row per offensive player per processed frame. Frames are skipped
        when they have no offensive QB, no football row, or fewer than two
        defenders, because the QB / ball / two-nearest-defender features
        cannot be computed for them.

    Notes
    -----
    `dir` is treated as degrees and converted to radians before any
    trigonometry, for the player as well as for the neighbours.
    """
    voronoi_calculator = VoronoiDiagram()
    influence_calculator = PlayerInfluenceCalculator()
    awareness_calculator = PlayerAwarenessCalculator()

    features = []
    grouped_frames = tracking_data.groupby(['gameId', 'playId', 'frameId'])

    for (gameId, playId, frameId), frame in tqdm(grouped_frames, total=len(grouped_frames), desc="Processing frames"):
        frame = frame.reset_index()

        # Row positions of the offensive players inside `frame`. They are kept
        # so that per-frame arrays (the Voronoi areas) can be mapped back to
        # the re-indexed `offense` table.
        offense_frame_rows = np.flatnonzero((frame.team == 1).to_numpy())

        # Separate offense, defense and the football
        offense = frame.iloc[offense_frame_rows].reset_index(drop=True)
        defense = frame[frame.team == -1].reset_index(drop=True)
        ball = frame[frame.team == 0]

        qb = offense[offense.position_QB == 1]
        if qb.empty or ball.empty or len(defense) < 2 or len(offense) < 2:
            # Incomplete frame: QB distance, ball distance or the two nearest
            # neighbours would be undefined (previously an IndexError).
            continue

        qb_x, qb_y = qb.iloc[0][['x', 'y']]
        ball_x, ball_y = ball.iloc[0][['x', 'y']]

        # Pre-compute Voronoi areas for every tracked object in the frame.
        # NOTE(review): assumes one area per input point, in input order -
        # confirm against VoronoiDiagram.compute_voronoi_areas.
        voronoi_areas = voronoi_calculator.compute_voronoi_areas(frame[['x', 'y']].values)

        # Precompute distance matrices (rows: offensive players)
        offense_positions = offense[['x', 'y']].values
        defense_positions = defense[['x', 'y']].values
        distances_to_defenders = np.linalg.norm(
            offense_positions[:, None, :] - defense_positions[None, :, :], axis=2
        )
        distances_to_teammates = np.linalg.norm(
            offense_positions[:, None, :] - offense_positions[None, :, :], axis=2
        )
        np.fill_diagonal(distances_to_teammates, np.inf)  # Ignore self in teammate distances

        # Precompute influences
        defender_influences = influence_calculator.calculate_influence_batch(
            defense_positions, defense[['dir', 'o', 's']].values, offense_positions
        )
        teammate_influences = influence_calculator.calculate_influence_batch(
            offense_positions, offense[['dir', 'o', 's']].values, offense_positions
        )

        # Precompute awareness
        defender_awarenesses = awareness_calculator.calculate_awareness_batch(
            defense_positions, defense[['o', 's']].values, offense_positions
        )
        teammate_awarenesses = awareness_calculator.calculate_awareness_batch(
            offense_positions, offense[['o', 's']].values, offense_positions
        )
        qb_awarenesses = awareness_calculator.calculate_awareness_batch(
            qb[['x', 'y']].values, qb[['o', 's']].values, offense_positions
        )

        # Construct features for each offensive player
        for idx, player in offense.iterrows():
            player_id = player['nflId']
            player_x, player_y = player['x'], player['y']
            player_s = player['s']
            # Degrees -> radians, matching the neighbour directions below.
            player_dir = np.radians(player['dir'])

            # Two closest defenders
            sorted_defender_indices = np.argsort(distances_to_defenders[idx])
            closest_defender_idx = sorted_defender_indices[0]
            second_closest_defender_idx = sorted_defender_indices[1]
            closest_defender = defense.iloc[closest_defender_idx]
            second_closest_defender = defense.iloc[second_closest_defender_idx]
            closest_defender_distance = distances_to_defenders[idx, closest_defender_idx]
            second_closest_defender_distance = distances_to_defenders[idx, second_closest_defender_idx]
            closest_defender_speed = closest_defender['s']
            second_closest_defender_speed = second_closest_defender['s']
            closest_defender_dir = np.radians(closest_defender['dir'])
            second_closest_defender_dir = np.radians(second_closest_defender['dir'])
            closest_defender_awareness = awareness_calculator.calculate_awareness(
                closest_defender['x'],
                closest_defender['y'],
                closest_defender['o'],
                closest_defender['s'],
                player_x, player_y
            )
            second_closest_defender_awareness = awareness_calculator.calculate_awareness(
                second_closest_defender['x'],
                second_closest_defender['y'],
                second_closest_defender['o'],
                second_closest_defender['s'],
                player_x, player_y
            )

            # Two closest teammates (self is excluded via the inf diagonal;
            # with only two offensive players the "second closest" is the
            # player itself at distance inf).
            sorted_teammate_indices = np.argsort(distances_to_teammates[idx])
            closest_teammate_idx = sorted_teammate_indices[0]
            second_closest_teammate_idx = sorted_teammate_indices[1]
            closest_teammate = offense.iloc[closest_teammate_idx]
            second_closest_teammate = offense.iloc[second_closest_teammate_idx]
            closest_teammate_distance = distances_to_teammates[idx, closest_teammate_idx]
            second_closest_teammate_distance = distances_to_teammates[idx, second_closest_teammate_idx]
            closest_teammate_speed = closest_teammate['s']
            second_closest_teammate_speed = second_closest_teammate['s']
            closest_teammate_dir = np.radians(closest_teammate['dir'])
            second_closest_teammate_dir = np.radians(second_closest_teammate['dir'])
            closest_teammate_awareness = awareness_calculator.calculate_awareness(
                closest_teammate['x'],
                closest_teammate['y'],
                closest_teammate['o'],
                closest_teammate['s'],
                player_x, player_y
            )
            second_closest_teammate_awareness = awareness_calculator.calculate_awareness(
                second_closest_teammate['x'],
                second_closest_teammate['y'],
                second_closest_teammate['o'],
                second_closest_teammate['s'],
                player_x, player_y
            )

            # Velocity components of the player (sin -> x, cos -> y, as in the
            # neighbour terms below).
            player_vx = player_s * np.sin(player_dir)
            player_vy = player_s * np.cos(player_dir)

            # Key order is the CSV column order downstream; do not reorder.
            features.append({
                'gameId': gameId,
                'playId': playId,
                'frameId': frameId,
                'nflId': player_id,
                'week': player['week'],
                # Index by the player's row in `frame`, not in `offense`.
                'voronoi_area': voronoi_areas[offense_frame_rows[idx]],
                'distance_to_qb': np.sqrt((player_x - qb_x)**2 + (player_y - qb_y)**2),
                'distance_to_ball': np.sqrt((player_x - ball_x)**2 + (player_y - ball_y)**2),
                'defender_influence': defender_influences[:, idx].sum(),
                'teammate_influence': teammate_influences[:, idx].sum(),
                'defender_awareness': defender_awarenesses[:, idx].sum(),
                'teammate_awareness': teammate_awarenesses[:, idx].sum(),
                'qb_awareness': qb_awarenesses[0, idx].sum(),
                'closest_defender_awareness': closest_defender_awareness,
                'second_closest_defender_awareness': second_closest_defender_awareness,
                'closest_teammate_awareness': closest_teammate_awareness,
                'second_closest_teammate_awareness': second_closest_teammate_awareness,
                'closest_defender_distance': closest_defender_distance,
                'second_closest_defender_distance': second_closest_defender_distance,
                'closest_defender_speed': closest_defender_speed,
                'second_closest_defender_speed': second_closest_defender_speed,
                'closest_teammate_distance': closest_teammate_distance,
                'second_closest_teammate_distance': second_closest_teammate_distance,
                'closest_teammate_speed': closest_teammate_speed,
                'second_closest_teammate_speed': second_closest_teammate_speed,
                'player_speed': player_s,
                'relative_x_speed_to_closest_defender': player_vx - closest_defender_speed * np.sin(closest_defender_dir),
                'relative_x_speed_to_second_closest_defender': player_vx - second_closest_defender_speed * np.sin(second_closest_defender_dir),
                'relative_y_speed_to_closest_defender': player_vy - closest_defender_speed * np.cos(closest_defender_dir),
                'relative_y_speed_to_second_closest_defender': player_vy - second_closest_defender_speed * np.cos(second_closest_defender_dir),
                'relative_x_speed_to_closest_teammate': player_vx - closest_teammate_speed * np.sin(closest_teammate_dir),
                'relative_x_speed_to_second_closest_teammate': player_vx - second_closest_teammate_speed * np.sin(second_closest_teammate_dir),
                'relative_y_speed_to_closest_teammate': player_vy - closest_teammate_speed * np.cos(closest_teammate_dir),
                'relative_y_speed_to_second_closest_teammate': player_vy - second_closest_teammate_speed * np.cos(second_closest_teammate_dir),
                'distance_to_los': player['distance_to_los'],
                'y_position': player['y'],
                "delta_x": player['delta_x'],
                "delta_y": player['delta_y'],
                "delta_s": player['delta_s'],
                "delta_a": player['delta_a'],
                "delta_o": player['delta_o'],
                "delta_dir": player['delta_dir'],
                'position_QB': player['position_QB'],
                'position_RB': player['position_RB'],
                'position_WR': player['position_WR'],
                'position_TE': player['position_TE'],
                'position_FB': player['position_FB'],
                'position_C': player['position_C'],
                'position_T': player['position_T'],
                'position_G': player['position_G'],
                'isBallCarrier': player['isBallCarrier'],
                'isReceiver': player['isReceiver'],
                'isRusher': player['isRusher'],
                'isBlocker': player['isBlocker'],
            })

    return pd.DataFrame(features)

In [4]:
def process_data(DATA_DIR, start_week, end_week):
    """Build per-player pre-snap features for a range of weeks and write them to CSV.

    For each week in ``[start_week, end_week]`` the function reads
    ``data/tracking_week_{week}.csv``, keeps the ``BEFORE_SNAP`` frames,
    mirrors left-moving plays so every play runs left to right, joins the
    tracking rows with the module-level ``plays``, ``players``, ``games`` and
    ``player_play`` DataFrames through DuckDB, derives the position one-hot
    flags, runs ``prep`` and writes the result to ``DATA_DIR``.

    Parameters
    ----------
    DATA_DIR : str
        Path of the output CSV *file* (despite the name, not a directory).
        If the file does not exist it is created with a header; otherwise
        rows are appended without a header.
    start_week, end_week : int
        Inclusive range of week numbers to process.

    Notes
    -----
    - Because an existing file is appended to, calling this twice with the
      same path and weeks duplicates rows. Delete the file first if a clean
      rebuild is intended.
    - The SQL resolves ``tracking_data`` (local) and ``plays`` / ``players`` /
      ``games`` / ``player_play`` (module globals) by variable name, so those
      names must not be renamed.
    - NOTE(review): ``delta_o`` / ``delta_dir`` are raw differences of angles
      and are not wrapped (e.g. 359 -> 1 yields -358); ``delta_y`` is divided
      by 120 like ``delta_x``. Confirm both are intended.
    """
    # Offensive positions that receive a position_<POS> indicator column.
    one_hot_positions = ('QB', 'RB', 'FB', 'WR', 'TE', 'C', 'G', 'T')

    for i in range(start_week, end_week+1, 1):
        print(f"Processing data from week {i}.")

        # Load the raw tracking data for this week
        tracking_data = pd.read_csv(f'data/tracking_week_{i}.csv')

        # Filter data to pre-snap frames; copy so the column assignments below
        # operate on an independent frame instead of a slice of the raw data.
        tracking_data = tracking_data[tracking_data['frameType'] == 'BEFORE_SNAP'].copy()

        # Flip plays to ensure all plays are from left to right (vectorised:
        # only rows whose playDirection is 'left' are replaced).
        moving_left = tracking_data['playDirection'] == 'left'
        tracking_data['x'] = tracking_data['x'].mask(moving_left, 120 - tracking_data['x'])
        tracking_data['dir'] = tracking_data['dir'].mask(moving_left, (-tracking_data['dir'] + 360) % 360)
        tracking_data['o'] = tracking_data['o'].mask(moving_left, (-tracking_data['o'] + 360) % 360)

        match_query = """
                    SELECT
                        td.gameId,
                        td.playId,
                        td.nflId,
                        td.frameId,
                        gs.week,
                        td.x AS x,
                        td.y AS y,
                        td.dis,
                        td.s,
                        td.a,
                        td.o,
                        td.dir,
                        ps.position,
                        CASE 
                            WHEN td.club = pl.possessionTeam THEN 1 -- OFFENSE
                            WHEN td.club = pl.defensiveTeam THEN -1 -- DEFENSE
                            ELSE 0 -- football (td.club) 
                        END AS team,
                        CASE
                            WHEN td.playDirection = 'left' THEN ((120 - pl.absoluteYardlineNumber) - td.x)
                            ELSE (pl.absoluteYardlineNumber - td.x)
                        END AS distance_to_los,
                        CASE 
                            WHEN 53.3 - td.y <= td.y - 0 THEN (53.3 - td.y)
                            ELSE td.y
                        END AS distance_to_closest_sideline,
                        (td.x / 120) - LAG(td.x / 120) OVER(PARTITION BY td.gameId, td.playId, td.nflId ORDER BY td.frameId ASC) AS delta_x,
                        (td.y / 120) - LAG(td.y / 120) OVER(PARTITION BY td.gameId, td.playId, td.nflId ORDER BY td.frameId ASC) AS delta_y,
                        (td.s) - LAG(td.s) OVER(PARTITION BY td.gameId, td.playId, td.nflId ORDER BY td.frameId ASC) AS delta_s,
                        (td.a) - LAG(td.a) OVER(PARTITION BY td.gameId, td.playId, td.nflId ORDER BY td.frameId ASC) AS delta_a,
                        (td.o) - LAG(td.o) OVER(PARTITION BY td.gameId, td.playId, td.nflId ORDER BY td.frameId ASC) AS delta_o,
                        (td.dir) - LAG(td.dir) OVER(PARTITION BY td.gameId, td.playId, td.nflId ORDER BY td.frameId ASC) AS delta_dir,
                        CASE 
                            WHEN pp.wasTargettedReceiver == 1 THEN 1
                            WHEN pp.hadRushAttempt == 1 THEN 1
                            ELSE 0
                        END AS isBallCarrier,
                        pp.wasTargettedReceiver AS isReceiver,
                        pp.hadRushAttempt AS isRusher,
                        CASE
                            WHEN blockedPlayerNFLId1 is NULL OR blockedPlayerNFLId1 == 0 THEN 0
                            ELSE 1
                        END AS isBlocker
                    FROM
                        tracking_data td
                    LEFT JOIN
                        plays pl
                    ON
                        td.gameId = pl.gameId AND td.playId = pl.playId
                    LEFT JOIN
                        players ps
                    ON
                        td.nflId = ps.nflId
                    LEFT JOIN
                        games gs
                    ON
                        td.gameId = gs.gameId
                    LEFT JOIN
                        player_play pp
                    ON
                        td.gameId = pp.gameId AND td.playId = pp.playId AND td.nflId = pp.nflId
                    """

        # Must run in this function so DuckDB can see the local `tracking_data`.
        tracking_data = duckdb.query(match_query).df()

        # Fill NaNs with 0 (covers the football's missing nflId/position and
        # the first-frame LAG deltas)
        tracking_data = tracking_data.fillna(0)

        # Sort rows by game, play, frame, and player
        tracking_data = tracking_data.sort_values(by = ['gameId', 'playId', 'frameId', 'nflId'])

        # One-hot encode the offensive positions consumed by prep()
        for position in one_hot_positions:
            tracking_data[f'position_{position}'] = (tracking_data['position'] == position).astype(int)

        results_df = prep(tracking_data=tracking_data)

        if results_df.empty:
            # An empty frame has no columns; writing it would create a
            # header-less file that later weeks would then append to.
            print(f"No features produced for week {i}; nothing written.")
        else:
            # Create the file with a header on first write, append afterwards.
            output_exists = os.path.exists(DATA_DIR)
            results_df.to_csv(
                DATA_DIR,
                encoding='utf-8',
                mode='a' if output_exists else 'w',
                header=not output_exists,
                index=False,
            )

        # Release the week's frames before loading the next one
        del tracking_data
        del results_df
        gc.collect()

In [None]:
# Build the week-9 feature file. process_data() appends to the output file
# when it already exists, so remove it first for a clean rebuild.
process_data(
    DATA_DIR="data/visualization_decoy_data_week_9.csv",
    start_week=9,
    end_week=9,
)