In [1]:
import pandas as pd
import numpy as np
import os

# Input Data

In [2]:
folder_path = os.environ.get("NFL_DATA_PATH") 
file_path = f'{folder_path}/train'

csvs = os.listdir(file_path)

df_list = []

for file in csvs:
    if file.startswith('input'):
        dfs = pd.read_csv(f'{file_path}/{file}')

        df_list.append(dfs)

df_input = pd.concat(df_list, ignore_index=True)

df_input.head()


Unnamed: 0,game_id,play_id,player_to_predict,nfl_id,frame_id,play_direction,absolute_yardline_number,player_name,player_height,player_weight,...,player_role,x,y,s,a,dir,o,num_frames_output,ball_land_x,ball_land_y
0,2023090700,101,False,54527,1,right,42,Bryan Cook,6-1,210,...,Defensive Coverage,52.33,36.94,0.09,0.39,322.4,238.24,21,63.259998,-0.22
1,2023090700,101,False,54527,2,right,42,Bryan Cook,6-1,210,...,Defensive Coverage,52.33,36.94,0.04,0.61,200.89,236.05,21,63.259998,-0.22
2,2023090700,101,False,54527,3,right,42,Bryan Cook,6-1,210,...,Defensive Coverage,52.33,36.93,0.12,0.73,147.55,240.6,21,63.259998,-0.22
3,2023090700,101,False,54527,4,right,42,Bryan Cook,6-1,210,...,Defensive Coverage,52.35,36.92,0.23,0.81,131.4,244.25,21,63.259998,-0.22
4,2023090700,101,False,54527,5,right,42,Bryan Cook,6-1,210,...,Defensive Coverage,52.37,36.9,0.35,0.82,123.26,244.25,21,63.259998,-0.22


# Offensive Player DataFrame

In [3]:
#select needed columns for df
offense_df = df_input[df_input['player_side'] == 'Offense'][['nfl_id', 'player_name', 'player_height', 'player_weight', 'player_birth_date', 'player_position', 'player_side', 'player_role', 's', 'a', 'dir']].copy()

#convert height to total inches
offense_df['height_in_inches'] = offense_df['player_height'].astype(str).apply(lambda x: int(x.split('-')[0]) * 12 + int(x.split('-')[1]))

offense_df = offense_df.groupby(['nfl_id', 'player_name', 'player_position', 'player_side'], as_index=False).agg(
    player_height=('height_in_inches', 'mean'),
    player_weight=('player_weight', 'mean'),
    targets=('player_role', lambda x: (x == 'Targeted Receiver').sum()),
    routes=('player_role', lambda x: (x.isin(['Targeted Receiver', 'Other Route Runner'])).sum()),
    avg_speed=('s', 'mean'),
    avg_acceleration=('a', 'mean'),
    max_speed=('s', 'max'),
    max_acceleration=('a', 'max'),
    total_rows=('nfl_id', 'count')
)

offense_df.head()

Unnamed: 0,nfl_id,player_name,player_position,player_side,player_height,player_weight,targets,routes,avg_speed,avg_acceleration,max_speed,max_acceleration,total_rows
0,30842,Marcedes Lewis,TE,Offense,78.0,267.0,115,912,2.638827,2.023586,7.08,5.11,912
1,31446,Matt Prater,K,Offense,70.0,201.0,0,23,0.564348,0.414348,1.23,0.77,23
2,33099,Joe Flacco,QB,Offense,78.0,245.0,0,0,2.020843,1.973177,7.48,8.43,4945
3,34452,Matthew Stafford,QB,Offense,75.0,220.0,0,0,1.903507,2.14315,7.53,8.98,10835
4,34843,Brian Hoyer,QB,Offense,74.0,215.0,0,0,1.636848,1.988934,5.58,8.83,882


# Center of Play

In [4]:
center = df_input[(df_input['player_position'] == 'QB') & (df_input['frame_id'] == 1)][['game_id', 'play_id', 'y']].rename(columns={'y': 'qb_y'})

center.head()

Unnamed: 0,game_id,play_id,qb_y
182,2023090700,101,30.07
586,2023090700,194,30.15
837,2023090700,219,23.79
1330,2023090700,361,30.09
1612,2023090700,436,29.68


# Angle DataFrame

In [5]:
# function to calculate 
# Angle df creation and transformation
angle_df = df_input[(df_input['player_side'] == 'Offense') & (df_input['player_role'] == 'Targeted Receiver') & (df_input['frame_id'] <= 46)][['game_id', 'play_id', 'nfl_id', 'player_name', 'frame_id', 'play_direction', 'x', 'y', 's', 'a', 'o', 'dir']].sort_values(['game_id', 'play_id', 'nfl_id', 'player_name', 'frame_id']).copy()

angle_df = angle_df.merge(center, on=['game_id', 'play_id'], how='left')

angle_df = angle_df.groupby(['game_id', 'play_id', 'nfl_id', 'player_name', 'play_direction'])

angle_df.head()

Unnamed: 0,game_id,play_id,nfl_id,player_name,frame_id,play_direction,x,y,s,a,o,dir,qb_y
0,2023090700,101,44930,Josh Reynolds,1,right,41.03,12.17,0.00,0.00,80.97,156.35,30.07
1,2023090700,101,44930,Josh Reynolds,2,right,41.03,12.17,0.00,0.00,82.26,119.09,30.07
2,2023090700,101,44930,Josh Reynolds,3,right,41.05,12.18,0.02,0.47,83.33,65.03,30.07
3,2023090700,101,44930,Josh Reynolds,4,right,41.07,12.20,0.18,1.54,84.29,56.06,30.07
4,2023090700,101,44930,Josh Reynolds,5,right,41.11,12.22,0.57,3.09,88.21,59.41,30.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...
394833,2024010713,4018,52457,Chase Claypool,1,left,51.23,19.04,0.00,0.00,277.57,168.42,29.67
394834,2024010713,4018,52457,Chase Claypool,2,left,51.22,19.04,0.00,0.00,277.57,198.49,29.67
394835,2024010713,4018,52457,Chase Claypool,3,left,51.21,19.04,0.05,1.06,278.25,267.61,29.67
394836,2024010713,4018,52457,Chase Claypool,4,left,51.19,19.04,0.26,2.01,280.01,261.89,29.67


# Route Classification

In [6]:
#filter the input dataframe to rows needed for route classification
route_df = df_input[
    (df_input['player_side'] == 'Offense') & (df_input['player_role'] == 'Targeted Receiver') & (df_input['frame_id'] <= 46)
    ][['game_id', 'play_id', 'nfl_id', 'play_direction', 'frame_id', 'x', 'y']].sort_values(
        ['game_id', 'play_id', 'nfl_id', 'frame_id']
    ).copy()

#merge qb center y position
route_df = route_df.merge(center, on=['game_id', 'play_id'], how='left')

#aggregate measures to classify routes
route_summary = (route_df.groupby(['game_id', 'play_id', 'nfl_id', 'play_direction', 'qb_y'], as_index=False)
    .agg(
        start_x=('x', 'first'),
        end_x=('x', 'last'),
        max_x=('x', 'max'),
        min_x=('x', 'min'),
        start_y=('y', 'first'),
        end_y=('y', 'last'),
        max_y=('y', 'max'),
        min_y=('y', 'min')
    )
)

#calculate deltas and furthest_x for route classification
route_summary['delta_x'] = abs(route_summary['end_x'] - route_summary['start_x'])
route_summary['furthest_x'] = np.where(route_summary['play_direction'] == 'right', route_summary['max_x'] - route_summary['start_x'], abs(route_summary['min_x'] - route_summary['start_x']))
route_summary['delta_y'] = route_summary['end_y'] - route_summary['start_y']

#determine side of formation
route_summary['side_of_formation'] = np.where(
    ((route_summary['play_direction'] == 'right') & (route_summary['start_y'] > route_summary['qb_y'])), 'left',
    np.where(
        ((route_summary['play_direction'] == 'left') & (route_summary['start_y'] < route_summary['qb_y'])),
        'left',
        'right'
    )
)

#function to classify routes based on deltas
def classify_route(row):
    dx = row['delta_x']
    dy = row['delta_y']
    sof = row['side_of_formation']
    direction = row['play_direction']
    dis = row.get('furthest_x', None)  # safe get, in case it's missing

    # Short routes (Slant, Flat, Screen)
    if dx <= 3:
        if direction == 'right':
            if sof == 'right':
                if dy > 2:
                    return 'Slant'
                elif dy < -2:
                    return 'Flat' 
                else:
                    return 'Screen' 
            elif sof == 'left':
                if dy > 2:
                    return 'Flat'
                elif dy < -2:
                    return 'Slant' 
                else:
                    return 'Screen'
        if direction == 'left':
            if sof == 'right':
                if dy > 2:
                    return 'Flat' 
                elif dy < -2:
                    return 'Slant' 
                else:
                    return 'Screen'
            elif sof == 'left':
                if dy > 2:
                    return 'Slant' 
                elif dy < -2:
                    return 'Flat' 
                else:
                    return 'Screen'

    # Hitch 
    elif 3 < dx <= 7 and abs(dy) <= 2:
        return 'Hitch'

    # Quick routes (short cross/out)
    elif 3 < dx <= 7 and abs(dy) > 2:
        if direction == 'right':
            if sof == 'right' and dy > 0:
                return 'Quick Out'
            elif sof == 'right' and dy < 0:
                return 'Quick In'
            elif sof == 'left' and dy > 0:
                return 'Quick In'
            else:
                return 'Quick Out'
        elif direction == 'left':
            if sof == 'right' and dy > 0:
                return 'Quick In'
            elif sof == 'right' and dy < 0:
                return 'Quick Out'
            elif sof == 'left' and dy > 0:
                return 'Quick Out'
            else:
                return 'Quick In'

    # Intermediate routes (curl/comeback/in/out)
    elif 7 < dx <= 12:
        if abs(dy) >= 5:
            # Deep In/Out classification
            if direction == 'right':
                if sof == 'right':
                    return 'Deep In' if dy > 0 else 'Deep Out'
                elif sof == 'left':
                    return 'Deep Out' if dy > 0 else 'Deep In'
            elif direction == 'left':
                if sof == 'right':
                    return 'Deep Out' if dy > 0 else 'Deep In'
                elif sof == 'left':
                    return 'Deep In' if dy > 0 else 'Deep Out'
        elif abs(dy) < 5:
            # Curl/Comeback classification
            if direction == 'right':
                if sof == 'right':
                    return 'Curl' if dy > 0 else 'Comeback'
                elif sof == 'left':
                    return 'Comeback' if dy > 0 else 'Curl'
            elif direction == 'left':
                if sof == 'right':
                    return 'Comeback' if dy > 0 else 'Curl'
                elif sof == 'left':
                    return 'Curl' if dy > 0 else 'Comeback'

    # Deep routes (Go, Post, Corner)
    elif dx > 12:
        if abs(dy) < 5:
            return 'Go'
        elif abs(dy) >= 5:
            if direction == 'right':
                if sof == 'right':
                    return 'Post' if dy > 0 else 'Corner'
                elif sof == 'left':
                    return 'Corner' if dy > 0 else 'Post'
            elif direction == 'left':
                if sof == 'right':
                    return 'Corner' if dy > 0 else 'Post'
                elif sof == 'left':
                    return 'Post' if dy > 0 else 'Corner'

    # Default case if none of the above match
    return 'Other'

route_summary['route'] = route_summary.apply(classify_route, axis=1)                    

route_summary.head()

Unnamed: 0,game_id,play_id,nfl_id,play_direction,qb_y,start_x,end_x,max_x,min_x,start_y,end_y,max_y,min_y,delta_x,furthest_x,delta_y,side_of_formation,route
0,2023090700,101,44930,right,30.07,41.03,52.43,52.43,41.03,12.17,14.14,14.37,12.17,11.4,11.4,1.97,right,Curl
1,2023090700,194,41325,left,30.15,93.33,88.98,93.98,88.98,27.85,22.23,27.93,22.23,4.35,4.35,-5.62,left,Quick In
2,2023090700,219,53591,left,23.79,80.57,75.98,80.57,75.98,10.65,10.22,10.65,10.22,4.59,4.59,-0.43,left,Hitch
3,2023090700,361,38696,right,30.09,20.65,33.55,33.55,20.65,37.9,47.95,48.35,37.9,12.9,12.9,10.05,left,Corner
4,2023090700,436,53541,right,29.68,29.46,33.67,33.67,29.46,35.92,37.8,37.97,35.92,4.21,4.21,1.88,left,Hitch


In [7]:
route_summary['route'].value_counts()

route
Quick In     2199
Hitch        1972
Go           1765
Quick Out    1687
Comeback     1491
Flat         1249
Curl          909
Deep Out      659
Post          628
Slant         499
Deep In       491
Corner        409
Screen        308
Name: count, dtype: int64

In [None]:
# create some unique keys 
route_summary["game_play_key"] = route_summary["game_id"].astype(str) + "-" + route_summary["play_id"].astype(str) 
route_summary["play_player_key"] = route_summary["game_play_key"] + "-" + route_summary["nfl_id"].astype(str)

# rank the receivers from left to right 
route_summary = route_summary.sort_values(by = ['game_play_key', 'start_y'], ascending=[True, False]) 
route_summary['position_rank'] = route_summary.groupby('game_play_key').cumcount() + 1 

# save to a csv 
route_summary.to_csv(f"{folder_path}//route_classification_results.csv", index = False) 

route_summary.head() 

Unnamed: 0,game_id,play_id,nfl_id,play_direction,qb_y,start_x,end_x,max_x,min_x,start_y,...,max_y,min_y,delta_x,furthest_x,delta_y,side_of_formation,route,game_play_key,play_player_key,position_rank
12,2023090700,1001,55920,right,23.6,107.49,117.57,117.57,107.49,29.95,...,34.84,29.95,10.08,10.08,4.89,left,Comeback,2023090700-1001,2023090700-1001-55920,1
0,2023090700,101,44930,right,30.07,41.03,52.43,52.43,41.03,12.17,...,14.37,12.17,11.4,11.4,1.97,right,Curl,2023090700-101,2023090700-101-44930,1
13,2023090700,1069,53959,left,29.69,86.47,80.51,86.47,80.51,34.43,...,35.26,31.0,5.96,5.96,-3.43,right,Quick Out,2023090700-1069,2023090700-1069-53959,1
14,2023090700,1154,53541,left,23.74,71.7,55.99,71.7,55.99,32.56,...,32.56,24.74,15.71,15.71,-7.82,right,Post,2023090700-1154,2023090700-1154-53541,1
15,2023090700,1201,55899,left,23.66,52.96,49.74,52.96,49.74,9.88,...,10.13,9.59,3.22,3.22,0.25,left,Hitch,2023090700-1201,2023090700-1201-55899,1


# Output Data

In [9]:
print(csvs)

['input_2023_w01.csv', 'input_2023_w02.csv', 'input_2023_w03.csv', 'input_2023_w04.csv', 'input_2023_w05.csv', 'input_2023_w06.csv', 'input_2023_w07.csv', 'input_2023_w08.csv', 'input_2023_w09.csv', 'input_2023_w10.csv', 'input_2023_w11.csv', 'input_2023_w12.csv', 'input_2023_w13.csv', 'input_2023_w14.csv', 'input_2023_w15.csv', 'input_2023_w16.csv', 'input_2023_w17.csv', 'input_2023_w18.csv', 'output_2023_w01.csv', 'output_2023_w02.csv', 'output_2023_w03.csv', 'output_2023_w04.csv', 'output_2023_w05.csv', 'output_2023_w06.csv', 'output_2023_w07.csv', 'output_2023_w08.csv', 'output_2023_w09.csv', 'output_2023_w10.csv', 'output_2023_w11.csv', 'output_2023_w12.csv', 'output_2023_w13.csv', 'output_2023_w14.csv', 'output_2023_w15.csv', 'output_2023_w16.csv', 'output_2023_w17.csv', 'output_2023_w18.csv']


In [10]:
df_list = []

for file in csvs:
    if file.startswith('output'):
        dfs = pd.read_csv(f'{file_path}/{file}')

        df_list.append(dfs)
        
df_output = pd.concat(df_list, ignore_index=True)
df_output.head() 

Unnamed: 0,game_id,play_id,nfl_id,frame_id,x,y
0,2023090700,101,46137,1,56.22,17.28
1,2023090700,101,46137,2,56.63,16.88
2,2023090700,101,46137,3,57.06,16.46
3,2023090700,101,46137,4,57.48,16.02
4,2023090700,101,46137,5,57.91,15.56


# Supplementary Data

In [11]:
supp_path = f'{folder_path}/supplementary_data.csv'

df_sup = pd.read_csv(supp_path)
df_sup.head()

  df_sup = pd.read_csv(supp_path)


Unnamed: 0,game_id,season,week,game_date,game_time_eastern,home_team_abbr,visitor_team_abbr,play_id,play_description,quarter,...,team_coverage_type,penalty_yards,pre_penalty_yards_gained,yards_gained,expected_points,expected_points_added,pre_snap_home_team_win_probability,pre_snap_visitor_team_win_probability,home_team_win_probability_added,visitor_team_win_probility_added
0,2023090700,2023,1,09/07/2023,20:20:00,KC,DET,3461,(10:46) (Shotgun) J.Goff pass deep left to J.R...,4,...,COVER_2_ZONE,,18,18,-0.664416,2.945847,0.834296,0.165704,-0.081149,0.081149
1,2023090700,2023,1,09/07/2023,20:20:00,KC,DET,461,(7:30) J.Goff pass short right to J.Reynolds t...,1,...,COVER_6_ZONE,,21,21,1.926131,1.345633,0.544618,0.455382,-0.029415,0.029415
2,2023090700,2023,1,09/07/2023,20:20:00,KC,DET,1940,(:09) (Shotgun) J.Goff pass incomplete deep ri...,2,...,COVER_2_ZONE,,0,0,0.281891,-0.081964,0.771994,0.228006,0.000791,-0.000791
3,2023090700,2023,1,09/07/2023,20:20:00,KC,DET,1711,"(:45) (No Huddle, Shotgun) P.Mahomes pass deep...",2,...,COVER_2_ZONE,,26,26,3.452352,2.342947,0.663187,0.336813,0.041843,-0.041843
4,2023090700,2023,1,09/07/2023,20:20:00,KC,DET,1588,(1:54) (Shotgun) P.Mahomes pass incomplete dee...,2,...,COVER_4_ZONE,,0,0,1.921525,-0.324035,0.615035,0.384965,6.1e-05,-6.1e-05


In [12]:
check_sup = df_sup[['game_id', 'play_id', 'route_of_targeted_receiver']].copy()

check = route_summary.merge(check_sup, on=['game_id', 'play_id'], how='left')

check.groupby(['route', 'route_of_targeted_receiver'], as_index=False).agg(
    route_count=('route', 'count'),
    actual_count=('route_of_targeted_receiver', 'count')
).head(100)

Unnamed: 0,route,route_of_targeted_receiver,route_count,actual_count
0,Comeback,ANGLE,72,72
1,Comeback,CORNER,92,92
2,Comeback,GO,321,321
3,Comeback,HITCH,382,382
4,Comeback,IN,86,86
...,...,...,...,...
95,Quick In,HITCH,285,285
96,Quick In,IN,62,62
97,Quick In,OUT,794,794
98,Quick In,POST,6,6
