# Setup 

In [1]:
import os 
import pandas as pd 
import numpy as np 

folder_path = os.environ.get("NFL_DATA_PATH") 
results_path = os.environ.get("NFL_RESULTS_PATH") 

# conversion factor for yards/second to miles/hour 
MPH_CONV = 3600 / 1760 

# Functions 

## import_playdata 

In [2]:
# import the data 
def import_playdata(weeknum):

    # read in the dataframes 
    df1 = pd.read_csv(f"{folder_path}//train//input_2023_w{weeknum:02}.csv") 
    df2 = pd.read_csv(f"{folder_path}//train//output_2023_w{weeknum:02}.csv") 

    # create the keys 
    df1["game_play_key"] = df1["game_id"].astype(str) + "-" + df1["play_id"].astype(str) 
    df2["game_play_key"] = df2["game_id"].astype(str) + "-" + df2["play_id"].astype(str) 
    df1["play_player_key"] = df1["game_play_key"] + "-" + df1["nfl_id"].astype(str) 
    df2["play_player_key"] = df2["game_play_key"] + "-" + df2["nfl_id"].astype(str) 

    # get the play metadata 
    df_meta = df1[["play_player_key", "game_play_key", "nfl_id", "player_side", "player_to_predict"]].drop_duplicates() 

    # adjust the frame ids for the output data 
    df_frames = df1.groupby("game_play_key").agg(max_frame_id = ("frame_id", "max")).reset_index() 
    df2 = df2.merge(df_frames, on = "game_play_key", how = "left") 
    df2["frame_id"] = df2["max_frame_id"] + df2["frame_id"] 
    df2 = df2.drop(columns = ["max_frame_id"]) 

    # combine the tracking data from both 
    df1["source"] = "input" 
    df2["source"] = "output" 
    df_pos = pd.concat([
        df1[["play_player_key", "frame_id", "x", "y", "source"]], 
        df2[["play_player_key", "frame_id", "x", "y", "source"]] 
    ]).sort_values(["play_player_key", "frame_id"]).reset_index(drop = True) 

    # add in the ball landing position 
    df_ball = df1[["play_player_key", "ball_land_x", "ball_land_y"]].drop_duplicates() 
    df_pos = df_pos.merge(df_ball, on = "play_player_key", how = "left") 

    return df_meta, df_pos, df1 

# # test the import function 
# df_meta, df_pos, df1 = import_playdata(1) 
# df_pos.head() 
# # .loc[df_pos["play_player_key"] == "2023091002-2127-52595"] #

## calc_distance 

In [3]:
# function to get the distance between two points from coordinates in dataframe columns
def calc_distance(df, cols1, cols2): 
    return ((df[cols1[0]] - df[cols2[0]])**2 + (df[cols1[1]] - df[cols2[1]])**2)**0.5 

## calc_top_speed 

In [4]:
def calc_top_speed(df_meta, df_pos): 

    # calculate the speed at each frame 
    df_pos["x_last"] = df_pos.groupby("play_player_key")["x"].shift(1) 
    df_pos["y_last"] = df_pos.groupby("play_player_key")["y"].shift(1) 
    df_pos["speed"] = calc_distance(df_pos, ["x", "y"], ["x_last", "y_last"]) * 10  # speed in feet per second (10 frames per second) 

    # get the top speed for each player in each play 
    df_speed = df_pos.groupby("play_player_key").agg(
        top_speed = ("speed", "max")
    ).reset_index() 

    # add the speed measurement in mph 
    df_speed["top_speed_mph"] = df_speed["top_speed"] * MPH_CONV 

    # add the metadata 
    df_speed = df_speed.merge(
        df_meta[["play_player_key", "nfl_id", "game_play_key", "player_side", "player_to_predict"]], 
        on = "play_player_key", 
        how = "left"
    ) 

    return df_speed 

# # test the function 
# df_meta, df_pos, df1 = import_playdata(1) 
# df_speed = calc_top_speed(df_meta, df_pos) 
# df_speed.head() 

## calc_defender_accel 

In [None]:
def calc_defender_accel(df_meta, df_pos): 

    # filter to just the defenders with output data 
    df_def = df_meta[(df_meta["player_side"] == "Defense") & (df_meta["player_to_predict"])][["play_player_key"]]
    df_pos = df_pos.merge(df_def[["play_player_key"]], on = "play_player_key", how = "inner") 

    # calculate the distance to the ball landing position at each frame 
    df_pos["dist_to_ball_land"] = calc_distance(df_pos, ["x", "y"], ["ball_land_x", "ball_land_y"]) 

    # calculate the acceleration to the ball at each frame 
    df_pos["dist_last"] = df_pos.groupby("play_player_key")["dist_to_ball_land"].shift(1) 
    df_pos["speed_to_ball"] = (df_pos["dist_last"] - df_pos["dist_to_ball_land"]) * 10  # speed in feet per second (10 frames per second) 
    df_pos["accel_to_ball"] = df_pos.groupby("play_player_key")["speed_to_ball"].diff() * 10  # acceleration in feet per second squared 

    # filter to just the output frames 
    df_pos = df_pos.loc[df_pos["source"] == "output"] 

    # get the first speed value for each play_player_key 
    df_initial = df_pos.groupby("play_player_key").head(1).reset_index(drop = True).rename(columns = {
        "speed_to_ball": "initial_speed_to_ball", 
        "accel_to_ball": "initial_accel_to_ball" 
    })[["play_player_key", "initial_speed_to_ball", "initial_accel_to_ball"]] 

    # get the top acceleration for each defender in each play 
    df_accel = df_pos.groupby("play_player_key").agg(
        peak_accel = ("accel_to_ball", "max") 
    ).reset_index() 

    # add the metadata 
    df_accel = df_accel.merge(
        df_meta[["play_player_key", "nfl_id", "game_play_key"]], 
        on = "play_player_key", 
        how = "left"
    ) 

    # add the initial values 
    df_accel = df_accel.merge(df_initial, on = "play_player_key", how = "left") 

    return df_accel 

# # test the function 
# df_meta, df_pos, df1 = import_playdata(1) 
# df_accel = calc_defender_accel(df_meta, df_pos) 
# df_accel.head() 

Unnamed: 0,play_player_key,peak_accel,nfl_id,game_play_key,initial_speed_to_ball,initial_accel_to_ball
0,2023090700-1001-44888,5.941749,44888,2023090700-1001,-0.603986,3.724402
1,2023090700-1001-47888,5.855379,47888,2023090700-1001,-0.67756,4.059601
2,2023090700-1001-53953,4.106192,53953,2023090700-1001,-3.783816,-3.047521
3,2023090700-1001-55910,6.517535,55910,2023090700-1001,0.48562,6.066897
4,2023090700-101-46137,5.667892,46137,2023090700-101,5.1242,3.689974


## calc_separation 

In [6]:
def calc_separation(df_meta, df_pos): 

    # get the final output positions 
    has_output = df_meta.loc[df_meta["player_to_predict"]][["play_player_key"]] 
    df_pos = df_pos.merge(has_output, on = "play_player_key", how = "inner") 
    df_pos["desc_rank"] = df_pos.groupby("play_player_key")["frame_id"].rank(method = "first", ascending = False) 
    df_final = df_pos.loc[df_pos["desc_rank"] == 1] 

    # separate the receivers and the defenders 
    df_final = df_final.merge(df_meta, on = "play_player_key", how = "left") 
    df_receivers = df_final[df_final["player_side"] == "Offense"][["game_play_key", "nfl_id", "x", "y", "ball_land_x", "ball_land_y"]].rename(columns = {
        "nfl_id": "rec_nfl_id",
        "x": "rec_x", 
        "y": "rec_y" 
    }) 
    df_defenders = df_final[df_final["player_side"] == "Defense"][["game_play_key", "nfl_id", "x", "y"]].rename(columns = {
        "nfl_id": "def_nfl_id",
        "x": "def_x", 
        "y": "def_y" 
    }) 

    # calculate the separation between each receiver and defender in the play 
    df_separation = df_receivers.merge(df_defenders, on = "game_play_key", how = "inner") 
    df_separation["separation"] = calc_distance(df_separation, ["rec_x", "rec_y"], ["def_x", "def_y"]) 

    # get the closest defender for each receiver 
    df_separation = df_separation.sort_values(["game_play_key", "rec_nfl_id", "separation"]) 
    df_separation = df_separation.groupby(["game_play_key", "rec_nfl_id"]).first().reset_index() 

    # calculate the distance from both to the ball landing position 
    df_separation["rec_ball_end_dist"] = calc_distance(df_separation, ["rec_x", "rec_y"], ["ball_land_x", "ball_land_y"])
    df_separation["def_ball_end_dist"] = calc_distance(df_separation, ["def_x", "def_y"], ["ball_land_x", "ball_land_y"]) 

    return df_separation 

# # test the function 
# df_meta, df_pos, df1 = import_playdata(1) 
# df_separation = calc_separation(df_meta, df_pos) 
# df_separation.head() 

## classify_route 

In [7]:
#speed thresholds
cutoff_speed = 5.0 

# function to classify routes based on deltas
def classify_route(row):
    dx = row['delta_x']
    dy = row['delta_y']
    sof = row['side_of_formation']
    direction = row['play_direction']
    accel = row['last_a']
    dis = row.get('furthest_x', None)  # safe get, in case it's missing

    # Short routes (Slant, Flat, Screen)
    if dx <= 3:
        if direction == 'right':
            if sof == 'right':
                if dy > 2:
                    return 'Slant'
                elif dy < -2:
                    return 'Flat' 
                else:
                    return 'Screen' 
            elif sof == 'left':
                if dy > 2:
                    return 'Flat'
                elif dy < -2:
                    return 'Slant' 
                else:
                    return 'Screen'
        if direction == 'left':
            if sof == 'right':
                if dy > 2:
                    return 'Flat' 
                elif dy < -2:
                    return 'Slant' 
                else:
                    return 'Screen'
            elif sof == 'left':
                if dy > 2:
                    return 'Slant' 
                elif dy < -2:
                    return 'Flat' 
                else:
                    return 'Screen'

    # Hitch 
    elif 3 < dx <= 7 and abs(dy) <= 2 and accel <= cutoff_speed:
        return 'Hitch'

    # Quick routes (short cross/out)
    elif 3 < dx <= 7 and abs(dy) > 2 and accel >= cutoff_speed:
        if direction == 'right':
            if sof == 'right' and dy > 0:
                return 'Quick Out'
            elif sof == 'right' and dy < 0:
                return 'Quick In'
            elif sof == 'left' and dy > 0:
                return 'Quick In'
            else:
                return 'Quick Out'
        elif direction == 'left':
            if sof == 'right' and dy > 0:
                return 'Quick In'
            elif sof == 'right' and dy < 0:
                return 'Quick Out'
            elif sof == 'left' and dy > 0:
                return 'Quick Out'
            else:
                return 'Quick In'

    # Intermediate routes (curl/comeback/in/out)
    elif 7 < dx <= 12:
        if abs(dy) >= 5 and accel >= cutoff_speed:
            # Deep In/Out classification
            if direction == 'right':
                if sof == 'right':
                    return 'Deep In' if dy > 0 else 'Deep Out'
                elif sof == 'left':
                    return 'Deep Out' if dy > 0 else 'Deep In'
            elif direction == 'left':
                if sof == 'right':
                    return 'Deep Out' if dy > 0 else 'Deep In'
                elif sof == 'left':
                    return 'Deep In' if dy > 0 else 'Deep Out'
        elif abs(dy) < 5 and accel <= cutoff_speed:
            # Curl/Comeback classification
            if direction == 'right':
                if sof == 'right':
                    return 'Curl' if dy > 0 else 'Comeback'
                elif sof == 'left':
                    return 'Comeback' if dy > 0 else 'Curl'
            elif direction == 'left':
                if sof == 'right':
                    return 'Comeback' if dy > 0 else 'Curl'
                elif sof == 'left':
                    return 'Curl' if dy > 0 else 'Comeback'

    # Deep routes (Go, Post, Corner)
    elif dx > 12:
        if abs(dy) < 5:
            return 'Go'
        elif abs(dy) >= 5:
            if direction == 'right':
                if sof == 'right':
                    return 'Post' if dy > 0 else 'Corner'
                elif sof == 'left':
                    return 'Corner' if dy > 0 else 'Post'
            elif direction == 'left':
                if sof == 'right':
                    return 'Corner' if dy > 0 else 'Post'
                elif sof == 'left':
                    return 'Post' if dy > 0 else 'Corner'

    # Default case if none of the above match
    return 'Other' 

## apply_route_classification 

In [None]:
# def apply_route_classification(df_input): 

#     # filter to just the rows we need for the classification 
#     route_df = (
#         df_input.loc[(df_input['player_side'] == 'Offense') & (df_input['frame_id'] <= 46)] 
#         [['play_player_key', 'game_play_key', 'game_id', 'play_id', 'nfl_id', "player_to_predict", 'play_direction', 'frame_id', 'x', 'y', 'a']] 
#         .sort_values(['play_player_key', 'frame_id'])
#     ) 

#     # get the QB center position at the snap for each play 
#     center = (
#         df_input[(df_input['player_position'] == 'QB') & (df_input['frame_id'] == 1)]
#         [['game_id', 'play_id', 'y']]
#         .rename(columns={'y': 'qb_y'}) 
#     ) 

#     # merge to get which side the receiver is on 
#     route_df = route_df.merge(center, on = ['game_id', 'play_id'], how = 'left') 

#     # aggregate measures to classify routes
#     route_summary = (
#         route_df.groupby(['play_player_key', "game_play_key", "nfl_id", "player_to_predict", 'play_direction', 'qb_y'], as_index=False)
#         .agg(
#             start_x=('x', 'first'),
#             end_x=('x', 'last'),
#             max_x=('x', 'max'),
#             min_x=('x', 'min'),
#             start_y=('y', 'first'),
#             end_y=('y', 'last'),
#             max_y=('y', 'max'),
#             min_y=('y', 'min'),
#             last_a=('a', 'last')
#         )
#     ) 

#     # calculate deltas and furthest_x for route classification
#     route_summary['delta_x'] = abs(route_summary['end_x'] - route_summary['start_x'])
#     route_summary['furthest_x'] = np.where(
#         route_summary['play_direction'] == 'right', 
#         route_summary['max_x'] - route_summary['start_x'], 
#         abs(route_summary['min_x'] - route_summary['start_x'])
#     )
#     route_summary['delta_y'] = route_summary['end_y'] - route_summary['start_y']

#     # determine side of formation
#     route_summary['side_of_formation'] = np.where(
#         ((route_summary['play_direction'] == 'right') & (route_summary['start_y'] > route_summary['qb_y'])), 'left',
#         np.where(
#             ((route_summary['play_direction'] == 'left') & (route_summary['start_y'] < route_summary['qb_y'])),
#             'left',
#             'right'
#         )
#     )

#     # classify the routes with our function 
#     route_summary['route'] = route_summary.apply(classify_route, axis=1) 

#     return route_summary[["play_player_key", "game_play_key", "nfl_id", "player_to_predict", "side_of_formation", "route"]]

# # # test the function 
# # df_meta, df_pos, df1 = import_playdata(1) 
# # df_routes = apply_route_classification(df1) 
# # df_routes.head() 

# Putting Everything Together 

In [7]:
# blank dataframes to append to 
df_speed = pd.DataFrame() 
df_accel = pd.DataFrame() 
df_separation = pd.DataFrame() 
df_routes = pd.DataFrame() 

# loop through each week and calculate the metrics 
for weeknum in range(1, 18):
    print(f"Processing week {weeknum}...") 
    df_meta, df_pos, df1 = import_playdata(weeknum) 

    # calculate top speed 
    new_speed = calc_top_speed(df_meta, df_pos) 
    df_speed = pd.concat([df_speed, new_speed]) 

    # calculate defender acceleration to ball 
    new_accel = calc_defender_accel(df_meta, df_pos) 
    df_accel = pd.concat([df_accel, new_accel]) 

    # calculate separation at catch 
    new_separation = calc_separation(df_meta, df_pos) 
    df_separation = pd.concat([df_separation, new_separation]) 

    # # classify routes 
    # new_routes = apply_route_classification(df1) 
    # df_routes = pd.concat([df_routes, new_routes]) 

# showcase the results 
print("Top Speed:")
display(df_speed.head()) 
print("\nDefender Acceleration to Ball:") 
display(df_accel.head()) 
print("\nSeparation at Catch:") 
display(df_separation.head()) 
# print("\nRoute Classifications:") 
# display(df_routes.head()) 

# # save the results
# df_speed.to_csv(f"{results_path}//top_speed.csv", index = False) 
# df_accel.to_csv(f"{results_path}//defender_accel.csv", index = False) 
# df_separation.to_csv(f"{results_path}//separation.csv", index = False) 

Processing week 1...
Processing week 2...
Processing week 3...
Processing week 4...
Processing week 5...
Processing week 6...
Processing week 7...
Processing week 8...
Processing week 9...
Processing week 10...
Processing week 11...
Processing week 12...
Processing week 13...
Processing week 14...
Processing week 15...
Processing week 16...
Processing week 17...
Top Speed:


Unnamed: 0,play_player_key,top_speed,top_speed_mph,nfl_id,game_play_key,player_side,player_to_predict
0,2023090700-1001-41325,6.902898,14.119564,41325,2023090700-1001,Offense,False
1,2023090700-1001-44822,2.19545,4.490693,44822,2023090700-1001,Offense,False
2,2023090700-1001-44834,6.103278,12.483977,44834,2023090700-1001,Defense,False
3,2023090700-1001-44888,3.612478,7.38916,44888,2023090700-1001,Defense,True
4,2023090700-1001-44906,3.671512,7.509911,44906,2023090700-1001,Defense,False



Defender Acceleration to Ball:


Unnamed: 0,play_player_key,avg_accel,max_accel,avg_ball_speed,max_ball_speed,nfl_id,game_play_key,initial_speed_to_ball,initial_accel_to_ball
0,2023090700-1001-44888,3.656039,5.941749,1.599743,3.410821,44888,2023090700-1001,-0.603986,3.724402
1,2023090700-1001-47888,2.714998,5.855379,1.451327,2.557664,47888,2023090700-1001,-0.67756,4.059601
2,2023090700-1001-53953,1.00509,4.106192,-3.566036,-2.272955,53953,2023090700-1001,-3.783816,-3.047521
3,2023090700-1001-55910,4.408237,6.517535,3.338272,5.168815,55910,2023090700-1001,0.48562,6.066897
4,2023090700-101-46137,1.411531,5.667892,6.909397,8.225727,46137,2023090700-101,5.1242,3.689974



Separation at Catch:


Unnamed: 0,game_play_key,rec_nfl_id,rec_x,rec_y,ball_land_x,ball_land_y,def_nfl_id,def_x,def_y,separation,rec_ball_end_dist,def_ball_end_dist
0,2023090700-1001,55920,117.57,36.21,115.610001,34.799999,55910,115.55,39.65,3.989236,2.414477,4.850372
1,2023090700-101,44930,62.49,2.83,63.259998,-0.22,46137,62.87,4.63,1.839674,3.145695,4.865655
2,2023090700-1069,53959,80.19,28.88,82.360001,29.030001,53487,76.14,25.49,5.281534,2.175179,7.156816
3,2023090700-1154,53541,51.36,15.51,52.959999,15.24,54486,48.51,13.26,3.631116,1.62262,4.870615
4,2023090700-1201,55899,48.84,13.01,51.389999,16.469999,54486,46.99,9.33,4.118847,4.29815,8.38687


## Aggregate Route Combinations 

In [8]:
# # summarize route combinations on each side of the formation 
# df_combos = df_routes.groupby(["game_play_key", 'side_of_formation'], as_index=False).agg(
#     receivers = ('nfl_id', 'count'), 
#     is_target = ('player_to_predict', 'sum'), 
#     route_combo = ('route', lambda x: '-'.join(sorted(x.dropna().astype(str).unique()))) 
# ) 

# display(df_routes.loc[df_routes["game_play_key"] == "2023092403-1749"])
# display(df_combos.loc[df_combos["game_play_key"] == "2023092403-1749"]) 

# # filter to just the targeted side 
# df_combos = df_combos[df_combos["is_target"] == 1].drop(columns = ["is_target"]) 
# df_combos = df_combos.groupby("game_play_key").head(1).reset_index(drop = True) 

# # showcase the data 
# df_combos.head(100)

## Join and Save 

In [9]:
# # build out the receiver metrics 
# df_rec = (
#     df_separation[["game_play_key", "rec_nfl_id", "separation"]]
#     .rename(columns = {"rec_nfl_id": "nfl_id"}) 
#     .merge(df_speed[["game_play_key", "nfl_id", "top_speed", "top_speed_mph"]], on = ["game_play_key", "nfl_id"], how = "left")  
# ) 

# build out the defender metrics 
df_def = (
    df_separation[["game_play_key", "def_nfl_id", "separation", "rec_ball_end_dist", "def_ball_end_dist"]] 
    .rename(columns = { "def_nfl_id": "nfl_id" }) 
    .merge(
        df_accel[["game_play_key", "nfl_id", "avg_accel", "max_accel", "avg_ball_speed", "max_ball_speed", "initial_speed_to_ball", "initial_accel_to_ball"]], 
        on = ["game_play_key", "nfl_id"], how = "left"
    )  
    # .merge(df_combos, on = "game_play_key", how = "left")
) 

# save the final results 
# df_rec.to_csv(f"{results_path}//receiver_metrics.csv", index = False) 
df_def.to_csv(f"{results_path}//defender_metrics.csv", index = False) 

# # showcase the data 
# print("\nReceiver Metrics:")
# display(df_rec.head()) 
print("\nDefender Metrics:")
display(df_def.head()) 


Defender Metrics:


Unnamed: 0,game_play_key,nfl_id,separation,rec_ball_end_dist,def_ball_end_dist,avg_accel,max_accel,avg_ball_speed,max_ball_speed,initial_speed_to_ball,initial_accel_to_ball
0,2023090700-1001,55910,3.989236,2.414477,4.850372,4.408237,6.517535,3.338272,5.168815,0.48562,6.066897
1,2023090700-101,46137,1.839674,3.145695,4.865655,1.411531,5.667892,6.909397,8.225727,5.1242,3.689974
2,2023090700-1069,53487,5.281534,2.175179,7.156816,4.499647,6.149146,-0.397173,0.734686,-1.521392,4.4371
3,2023090700-1154,54486,3.631116,1.62262,4.870615,-2.259205,8.192108,-3.100187,-0.56761,-1.565896,-37.093325
4,2023090700-1201,54486,4.118847,4.29815,8.38687,3.731243,6.151817,-0.20855,1.257376,-1.231144,1.233506
