# Setup 

In [2]:
import os, time 
import pandas as pd 
import numpy as np 
import plotly.graph_objects as go 

folder_path = os.environ.get("NFL_DATA_PATH") 
results_path = os.environ.get("NFL_RESULTS_PATH") 

# Receiver Metrics 

## import_playdata 

In [14]:
# import the data 
def import_playdata(weeknum):

    # read in the dataframes 
    df1 = pd.read_csv(f"{folder_path}//train//input_2023_w{weeknum:02}.csv") 
    df2 = pd.read_csv(f"{folder_path}//train//output_2023_w{weeknum:02}.csv") 

    # create the keys 
    df1["game_play_key"] = df1["game_id"].astype(str) + "-" + df1["play_id"].astype(str) 
    df2["game_play_key"] = df2["game_id"].astype(str) + "-" + df2["play_id"].astype(str) 
    df1["play_player_key"] = df1["game_play_key"] + "-" + df1["nfl_id"].astype(str) 
    df2["play_player_key"] = df2["game_play_key"] + "-" + df2["nfl_id"].astype(str) 

    # add the end frames 
    df1_ends = df1.groupby("game_play_key").agg(end_frame = ("frame_id", "max")).reset_index() 
    df1 = df1.merge(df1_ends, on = "game_play_key", how = "left") 
    df2_ends = df2.groupby("game_play_key").agg(end_frame = ("frame_id", "max")).reset_index() 
    df2 = df2.merge(df2_ends, on = "game_play_key", how = "left") 

    # get the player metadata 
    dfp = (
        df1[["play_player_key", "player_to_predict", "player_position", "player_side", "player_role"]]
        .drop_duplicates().reset_index(drop=True)
    )

    return df1, df2, dfp

# df1, df2, dfp = import_playdata(1) 
# df2.head() 

Unnamed: 0,game_id,play_id,nfl_id,frame_id,x,y,game_play_key,play_player_key,end_frame
0,2023090700,101,46137,1,56.22,17.28,2023090700-101,2023090700-101-46137,21
1,2023090700,101,46137,2,56.63,16.88,2023090700-101,2023090700-101-46137,21
2,2023090700,101,46137,3,57.06,16.46,2023090700-101,2023090700-101-46137,21
3,2023090700,101,46137,4,57.48,16.02,2023090700-101,2023090700-101-46137,21
4,2023090700,101,46137,5,57.91,15.56,2023090700-101,2023090700-101-46137,21


## calc_separation 

In [33]:
# function to get the distance between two points from coordinates in dataframe columns
def calc_distance(df, cols1, cols2): 
    return ((df[cols1[0]] - df[cols2[0]])**2 + (df[cols1[1]] - df[cols2[1]])**2)**0.5 

# function to calculate separation at the end of each play 
def calc_separation(df2, dfp):

    # get the final positions 
    df_final = df2.loc[df2["frame_id"] == df2["end_frame"]] 
    df_final = df_final.merge(dfp, on = "play_player_key", how = "left") 

    # get the targeted receiver coordinates 
    df_target = df_final.loc[(df_final["player_to_predict"] == 1) & (df_final["player_side"] == "Offense")].rename(columns = {
        "x": "target_x", 
        "y": "target_y", 
        "nfl_id": "target_nfl_id" 
    })[["game_play_key", "target_x", "target_y", "target_nfl_id"]]

    # get the defenders coordinates 
    df_defenders = df_final.loc[df_final["player_side"] == "Defense"].rename(columns = {
        "x": "defender_x", 
        "y": "defender_y", 
        "nfl_id": "defender_nfl_id" 
    })[["game_play_key", "defender_x", "defender_y", "defender_nfl_id"]] 

    # merge the two and calculate separation 
    df_sep = df_target.merge(df_defenders, on = "game_play_key", how = "inner") 
    df_sep["separation"] = calc_distance(
        df_sep, 
        cols1 = ["target_x", "target_y"], 
        cols2 = ["defender_x", "defender_y"]
    ) 

    # find the min separation per play 
    df_sep = df_sep.groupby("game_play_key").agg(
        separation = ("separation", "min")
    ).reset_index() 

    # df_sep = df_sep[["game_play_key", "separation"]]

    return df_sep 

df1, df2, dfp = import_playdata(1) 
df_sep = calc_separation(df2, dfp) 

df_sep.loc[df_sep["game_play_key"] == "2023090700-101"]

Unnamed: 0,game_play_key,separation
1,2023090700-101,1.839674


## calc_speed 

In [34]:
def calc_speed(df1, df2, dfp):

    # filter to just the targeted receiver 
    df1 = df1.loc[df1["player_to_predict"] == 1] 
    df2 = df2.merge(dfp, on = "play_player_key", how = "left") 
    df2 = df2.loc[df2["player_to_predict"] == 1] 

    # calculate speed for the output dataframe 
    df2 = df2.sort_values(by = ["game_play_key", "frame_id"]).reset_index(drop=True) 
    df2["prev_x"] = df2.groupby("game_play_key")["x"].shift(1) 
    df2["prev_y"] = df2.groupby("game_play_key")["y"].shift(1) 
    df2["speed"] = calc_distance(
        df2, 
        cols1 = ["x", "y"], 
        cols2 = ["prev_x", "prev_y"]
    ) / 10 
    df2["speed"] = df2["speed"].fillna(0) 

    # aggregate speed for both 
    dfs1 = df1.groupby("game_play_key").agg(MAX_SPEED_BEFORE = ("s", "max")).reset_index() 
    dfs2 = df2.groupby("game_play_key").agg(MAX_SPEED_AFTER = ("speed", "max")).reset_index() 

    # merge and get the overall max 
    df_speed = dfs1.merge(dfs2, on = "game_play_key", how = "inner") 
    df_speed["MAX_SPEED_OVERALL"] = df_speed[["MAX_SPEED_BEFORE", "MAX_SPEED_AFTER"]].max(axis=1) 

    return df_speed 

## Putting Everything Together 

In [35]:
# weeknum = 1 

# loop through each week and process the data 
df_all = pd.DataFrame() 
for weeknum in range(1, 18): 
    print(f"Processing week {weeknum}") 

    # perform the calculations 
    df1, df2, dfp = import_playdata(1) 
    df_sep = calc_separation(df2, dfp) 
    df_speed = calc_speed(df1, df2, dfp) 

    # merge the final columns together 
    df = df_sep.merge(df_speed, on = "game_play_key", how = "inner") 

    # append to the overall dataframe 
    df_all = pd.concat([df_all, df], axis=0)

# write to a csv 
df_all.to_csv(f"{folder_path}//separation_metrics.csv", index=False) 

# show the final dataframe 
df_all.head() 

Processing week 1
Processing week 2
Processing week 3
Processing week 4
Processing week 5
Processing week 6
Processing week 7
Processing week 8
Processing week 9
Processing week 10
Processing week 11
Processing week 12
Processing week 13
Processing week 14
Processing week 15
Processing week 16
Processing week 17


Unnamed: 0,game_play_key,separation,MAX_SPEED_BEFORE,MAX_SPEED_AFTER,MAX_SPEED_OVERALL
0,2023090700-1001,3.989236,5.81,1.625342,5.81
1,2023090700-101,1.839674,7.9,1.24925,7.9
2,2023090700-1069,5.281534,5.1,0.655276,5.1
3,2023090700-1154,3.631116,8.2,2.027363,8.2
4,2023090700-1201,4.118847,5.05,0.891756,5.05


In [None]:
# df1.loc[df1["game_play_key"] == "2023090700-101"][["nfl_id", "player_to_predict", "player_name", "player_position"]].drop_duplicates() 

Unnamed: 0,nfl_id,player_to_predict,player_name,player_position
0,54527,False,Bryan Cook,FS
26,46137,True,Justin Reid,SS
52,52546,True,L'Jarius Sneed,CB
78,53487,False,Nick Bolton,MLB
104,54486,False,Trent McDuffie,CB
130,53541,False,Amon-Ra St. Brown,WR
156,53959,False,Brock Wright,TE
182,43290,False,Jared Goff,QB
208,44930,True,Josh Reynolds,WR


# Target Counts 

In [None]:
# # weeknum = 1 

# # loop through each week and process the data 
# df_all = pd.DataFrame() 
# for weeknum in range(1, 18): 
#     print(f"Processing week {weeknum}") 

#     # read in the data 
#     df = pd.read_csv(f"{folder_path}//train//input_2023_w{weeknum:02}.csv") 

#     # filter to just the target player with the columns we want 
#     df = (
#         df.loc[(df["player_to_predict"] == True) & (df["player_side"] == "Offense")]
#         [["game_id", "play_id", "nfl_id", "player_name"]] 
#         .drop_duplicates().reset_index(drop=True) 
#     ) 

#     # add the key 
#     df["game_play_key"] = df["game_id"].astype(str) + "-" + df["play_id"].astype(str) 

#     # append to the overall dataframe 
#     df_all = pd.concat([df_all, df], axis=0, ignore_index=True) 

# # write to a csv 
# df_all.to_csv(f"{folder_path}//target_receivers.csv", index=False) 

# # showcase the data 
# df_all.head()

Processing week 1
Processing week 2
Processing week 3
Processing week 4
Processing week 5
Processing week 6
Processing week 7
Processing week 8
Processing week 9
Processing week 10
Processing week 11
Processing week 12
Processing week 13
Processing week 14
Processing week 15
Processing week 16
Processing week 17


Unnamed: 0,game_id,play_id,nfl_id,player_name,game_play_key
0,2023090700,101,44930,Josh Reynolds,2023090700-101
1,2023090700,194,41325,Jerick McKinnon,2023090700-194
2,2023090700,219,53591,Noah Gray,2023090700-219
3,2023090700,361,38696,Marvin Jones,2023090700-361
4,2023090700,436,53541,Amon-Ra St. Brown,2023090700-436


# Combine Results 

## Player List 

In [1]:
# loop through each week and process the data 
df_players = pd.DataFrame() 
for weeknum in range(1, 18): 
    print(f"Processing week {weeknum}") 
    
    # read in the data 
    df = pd.read_csv(f"{folder_path}//train//input_2023_w{weeknum:02}.csv") 

    # subset to the player metadata 
    df = df[["nfl_id", "player_name", "player_position"]].drop_duplicates().reset_index(drop=True) 

    # append to the overall dataframe 
    df_players = (
        pd.concat([df_players, df], axis=0, ignore_index=True) 
        .drop_duplicates().reset_index(drop=True) 
    ) 

# clean up a few of the columns 
df_players["player_name_upper"] = df_players["player_name"].str.upper() 

# save to a csv 
df_players.to_csv(f"{folder_path}//player_metadata.csv", index=False) 

# showcase the data 
df_players.head() 

NameError: name 'pd' is not defined

## Add Results 

In [4]:
df_players = pd.read_csv(f"{folder_path}//player_metadata.csv") 

df_players.loc[df_players["nfl_id"] == 55925]

Unnamed: 0,nfl_id,player_name,player_position,player_name_upper
168,55925,DJ Turner II,CB,DJ TURNER II


In [5]:
# loop through each year and add it 
df_all = pd.DataFrame() 
for year in range(2010, 2024): 
    print(f"Processing year {year}") 

    # define the table link
    table_link = f"https://www.pro-football-reference.com/draft/{year}-combine.htm#combine" 

    # download the table 
    df_combine = pd.read_html(table_link)[0] 

    # add the year column 
    df_combine["draft_year"] = year 

    # filter the data 
    df_combine = (
        df_combine.loc[df_combine["40yd"].notna()] 
        [["Player", "40yd", "3Cone", "Shuttle", "draft_year"]]
    ) 

    # append to the overall dataframe 
    df_all = pd.concat([df_all, df_combine], axis = 0, ignore_index = True) 

    # add a short delay to be polite to the server 
    time.sleep(1) 

# correct the column types 
df_all["40yd"] = pd.to_numeric(df_all["40yd"], errors = "coerce") 
df_all["3Cone"] = pd.to_numeric(df_all["3Cone"], errors = "coerce")
df_all["Shuttle"] = pd.to_numeric(df_all["Shuttle"], errors = "coerce") 

# manually look up a few players 
df_lookups = pd.DataFrame([
    ["Tyreek Hill", 4.29, None, None, 2016],
    ["Adam Thielen", 4.45, None, None, 2013], 
    ["Puka Nacua", 4.57, None, None, 2023], 
    ["DJ Moore", 4.42, None, None, 2018], 
    ["Chris Godwin Jr.", 4.42, None, None, 2017], 
    ["Drake London", 4.5, None, None, 2022], 
    ["DeVonta Smith", 4.48, None, None, 2021], 
    ["DK Metcalf", 4.33, None, None, 2019], 
    ["Romeo Doubs", 4.47, None, None, 2022]
], columns = ["Player", "40yd", "3Cone", "Shuttle", "draft_year"])  

# join in the lookups 
df_all = pd.concat([df_all, df_lookups], axis = 0, ignore_index = True) 

# # correct some of the names that don't match perfectly 
# df_all["player_name"] = df_all["player_name"].apply(lambda x: x.replace)

# join in the NFL ids 
df_all["player_name_upper"] = df_all["Player"].str.upper() 
df_players = pd.read_csv(f"{folder_path}//player_metadata.csv") 
df_players["player_name_upper"] = df_players["player_name_upper"].apply(lambda x: x.replace(" II", ""))
df_all = df_all.merge(df_players, on = "player_name_upper", how = "inner") 

# get the latest stat for each player 
df_all = df_all.sort_values(["player_name_upper", "draft_year"]).drop_duplicates("player_name_upper", keep = "last") 

# group the positions 
df_all = df_all.loc[df_all["player_position"].isin(["WR", "CB", "FS", "SS"])] 
df_all["position_group"] = np.where(df_all["player_position"] == "WR", "RECEIVER", "DEFENDER") 

# rank the players by 40 times 
df_all = df_all.sort_values("40yd").reset_index(drop = True) 
df_all["40yd_rank"] = df_all.groupby("position_group").cumcount() + 1 

# save to csv 
df_all.to_csv(f"{folder_path}//combine_data.csv", index = False) 

# showcase the data 
print(len(df_all.index))
df_all.head() 

Processing year 2010
Processing year 2011
Processing year 2012
Processing year 2013
Processing year 2014
Processing year 2015
Processing year 2016
Processing year 2017
Processing year 2018
Processing year 2019
Processing year 2020
Processing year 2021
Processing year 2022
Processing year 2023
373


  df_all = pd.concat([df_all, df_lookups], axis = 0, ignore_index = True)


Unnamed: 0,Player,40yd,3Cone,Shuttle,draft_year,player_name_upper,nfl_id,player_name,player_position,position_group,40yd_rank
0,DJ Turner,4.26,,,2023,DJ TURNER,55925,DJ Turner II,CB,DEFENDER,1
1,Marquise Goodwin,4.27,,,2013,MARQUISE GOODWIN,40026,Marquise Goodwin,WR,RECEIVER,1
2,Tyquan Thornton,4.28,,,2022,TYQUAN THORNTON,54515,Tyquan Thornton,WR,RECEIVER,2
3,Rondale Moore,4.28,6.65,4.06,2021,RONDALE MOORE,53478,Rondale Moore,WR,RECEIVER,3
4,Tyreek Hill,4.29,,,2016,TYREEK HILL,43454,Tyreek Hill,WR,RECEIVER,4


In [20]:
df_all.sort_values("40yd").reset_index(drop = True).head(25)

Unnamed: 0,Player,40yd,draft_year,player_name_upper,nfl_id,player_name,player_position
0,Marquise Goodwin,4.27,2013,MARQUISE GOODWIN,40026,Marquise Goodwin,WR
1,Tyquan Thornton,4.28,2022,TYQUAN THORNTON,54515,Tyquan Thornton,WR
2,Rondale Moore,4.28,2021,RONDALE MOORE,53478,Rondale Moore,WR
3,Tyreek Hill,4.29,2016,TYREEK HILL,43454,Tyreek Hill,WR
4,Parris Campbell,4.31,2019,PARRIS CAMPBELL,47842,Parris Campbell,WR
5,Curtis Samuel,4.31,2017,CURTIS SAMUEL,44852,Curtis Samuel,WR
6,Andy Isabella,4.31,2019,ANDY ISABELLA,47845,Andy Isabella,WR
7,Mecole Hardman,4.33,2019,MECOLE HARDMAN,47839,Mecole Hardman,WR
8,Phillip Dorsett,4.33,2015,PHILLIP DORSETT,42372,Phillip Dorsett,WR
9,DK Metcalf,4.33,2019,DK METCALF,47847,DK Metcalf,WR
