# Setup 

In [5]:
import os 
import pandas as pd 
import numpy as np 

import shap 
import xgboost as xgb 
from sklearn.metrics import accuracy_score 
from sklearn.preprocessing import LabelEncoder 

# get the file paths from environment variables 
folder_path = os.environ.get("NFL_DATA_PATH") 
results_path = os.environ.get("NFL_RESULTS_PATH") 

# turn off the pandas warning 
pd.options.mode.chained_assignment = None  # default='warn' 

  from .autonotebook import tqdm as notebook_tqdm


# Import Data 

In [4]:
# load the supplementary data for each play 
df_supp = pd.read_csv(f"{folder_path}/supplementary_data.csv") 
df_supp["game_play_key"] = df_supp["game_id"].astype(str) + "-" + df_supp["play_id"].astype(str) 

# load the defender metrics 
df_def = pd.read_csv(f"{results_path}/defender_metrics.csv") 

# load the combine data 
df_combine = pd.read_csv(f"{folder_path}/combine_data.csv") 

# join the dataframes together 
df_plays = (
    df_def[["game_play_key", "nfl_id"]].merge(df_supp[[
        "game_play_key", "week", "pass_result", "route_of_targeted_receiver", 
        "down", "yards_to_go", "team_coverage_type"
    ]], on = "game_play_key", how = "left") 
    .merge(df_combine[["nfl_id", "40yd"]], on = "nfl_id", how = "inner") 
) 

# add completion classification 
df_plays["is_completion"] = np.where(df_plays["pass_result"] == "C", 1, 0) 

# showcase the data 
df_plays.head() 

  df_supp = pd.read_csv(f"{folder_path}/supplementary_data.csv")


Unnamed: 0,game_play_key,nfl_id,week,pass_result,route_of_targeted_receiver,down,yards_to_go,team_coverage_type,40yd,is_completion
0,2023090700-101,46137,1,I,CORNER,3,3,COVER_2_ZONE,4.4,0
1,2023090700-1069,53487,1,C,IN,1,10,COVER_3_ZONE,4.6,1
2,2023090700-1154,54486,1,C,CROSS,2,7,COVER_3_ZONE,4.44,1
3,2023090700-1201,54486,1,C,SLANT,2,11,COVER_2_ZONE,4.44,1
4,2023090700-1494,54486,1,I,IN,2,5,COVER_3_ZONE,4.44,0


# Completion Model 

## Data Prep 

In [6]:
# split the data into train and test 
df_train = df_plays.loc[df_plays["week"] <= 13] 
df_test = df_plays.loc[df_plays["week"] > 13] 

# encode the route_of_targeted_receiver variable 
le1 = LabelEncoder() 
df_train['route_encoded'] = le1.fit_transform(df_train['route_of_targeted_receiver']) 
df_test['route_encoded'] = le1.transform(df_test['route_of_targeted_receiver']) 

# encode the team_coverage_type variable 
le2 = LabelEncoder() 
df_train["coverage_encoded"] = le2.fit_transform(df_train["team_coverage_type"]) 
df_test["coverage_encoded"] = le2.transform(df_test["team_coverage_type"]) 

# define the variables 
predictor_vars = ['40yd', 'route_encoded', "down", "yards_to_go", 'coverage_encoded']
response_var = 'is_completion' 

# subset the train/test features and target variables 
X_train = df_train[predictor_vars]
y_train = df_train[response_var] 
X_test = df_test[predictor_vars]
y_test = df_test[response_var]

## Model Training 

In [8]:
# Initialize and train the XGBoost model 
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    random_state=42,
    eval_metric='logloss'
)

# fit the model 
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred_train = xgb_model.predict(X_train) 
y_pred_test = xgb_model.predict(X_test) 

# Evaluate model
print(f"Training Accuracy: {accuracy_score(y_train, y_pred_train):.1%}")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_test):.1%}") 

Training Accuracy: 70.6%
Test Accuracy: 67.3%


## SHAP Values 

In [9]:
# Calculate SHAP values for test data
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_test) 

# put the shap values into a dataframe 
df_shap = pd.DataFrame({
    "game_play_key": df_test["game_play_key"].values, 
    "nfl_id": df_test["nfl_id"].values, 
    "40yd": df_test["40yd"].values, 
    "route_of_targeted_receiver": df_test["route_of_targeted_receiver"].values, 
    "shap_40yd": shap_values[:,0], 
    "shap_route_encoded": shap_values[:,1] 
})

# showcase the results 
df_shap.sort_values("shap_40yd", ascending = False).head() 

Unnamed: 0,game_play_key,nfl_id,40yd,route_of_targeted_receiver,shap_40yd,shap_route_encoded
568,2023121701-737,53575,4.76,HITCH,0.791502,0.507805
1442,2023123104-3052,42391,4.78,HITCH,0.790345,0.512622
1015,2023122404-3271,42391,4.78,HITCH,0.778402,0.52218
499,2023121602-1718,46175,4.82,HITCH,0.776021,0.508414
1495,2023123107-1618,43333,5.11,HITCH,0.746752,0.52218


# Example Setups 

## 40 Time Stats 

In [None]:
# calculate the average 40 time 
avg_40yd = df_shap["40yd"].mean() 
print(f"Average 40 Yard Dash Time: {avg_40yd:.2f} seconds") 

Average 40 Yard Dash Time: 4.51 seconds


## calc_completion_prob 

In [11]:
def calc_completion_prob(game_play_key, avg_40 = None): 

    # filter to the given play 
    play_features = X_test.loc[df_test["game_play_key"] == game_play_key].to_dict(orient='records')[0] 

    # replace the 40 time with the average if provided 
    if avg_40 is not None: 
        play_features['40yd'] = avg_40 
    
    # calculate the completion probability 
    comp_prob = xgb_model.predict_proba(
        pd.DataFrame(
            [[play_features['40yd'], play_features['route_encoded']]], 
            columns = ["40yd", "route_encoded"]
        )
    )[:, 1][0] 
    
    return comp_prob 

# test the function 
print(calc_completion_prob("2023121008-174")) 
print(calc_completion_prob("2023121008-174", avg_40 = avg_40)) 

IndexError: list index out of range

## compare_scenarios 

In [None]:
def compare_scenarios(game_play_key): 

    # calculate the probabilities 
    prob_actual = calc_completion_prob(game_play_key) 
    prob_avg = calc_completion_prob(game_play_key, avg_40 = avg_40yd) 

    # get the defender attributes for the play 
    defender_id = df_test.loc[df_test["game_play_key"] == game_play_key, "nfl_id"].values[0] 
    defender_40 = df_test.loc[df_test["game_play_key"] == game_play_key, "40yd"].values[0] 

    # showcase the results 
    print(f"\nFor play {game_play_key}:") 
    print(f"  Completion probability against {defender_id}: {prob_actual:.1%} (40 time = {defender_40:.2f} seconds)")
    print(f"  Completion probability against average defender: {prob_avg:.1%} (avg 40 time = {avg_40:.2f} seconds)") 

# test the function 
compare_scenarios("2023120700-3817")


For play 2023120700-3817:
  Completion probability against 43700: 29.7% (40 time = 4.33 seconds)
  Completion probability against average defender: 50.8% (avg 40 time = 4.49 seconds)


## Demo Examples 

In [85]:
df_shap.loc[df_shap["40yd"] < 4.35].sort_values("shap_40yd", ascending = True)

Unnamed: 0,game_play_key,nfl_id,40yd,route_of_targeted_receiver,shap_40yd,shap_route_encoded
23,2023120700-3817,43700,4.33,GO,-0.361502,-1.097589
39,2023121000-2294,54622,4.33,GO,-0.361502,-1.097589
490,2023121700-2443,54622,4.33,GO,-0.361502,-1.097589
1033,2023122408-3669,54622,4.33,GO,-0.361502,-1.097589
1035,2023122408-4039,54622,4.33,GO,-0.361502,-1.097589
...,...,...,...,...,...,...
1026,2023122408-2774,47877,4.30,HITCH,0.128048,0.424772
1408,2023123109-1828,47877,4.30,IN,0.254024,-0.135958
835,2023122300-2605,37078,4.31,CORNER,0.344116,-0.044431
905,2023122403-2041,53458,4.31,CORNER,0.344116,-0.044431


In [86]:
compare_scenarios("2023120700-3817") 
compare_scenarios("2023121000-2294") 
compare_scenarios("2023121700-2443") 


For play 2023120700-3817:
  Completion probability against 43700: 29.7% (40 time = 4.33 seconds)
  Completion probability against average defender: 50.8% (avg 40 time = 4.49 seconds)

For play 2023121000-2294:
  Completion probability against 54622: 29.7% (40 time = 4.33 seconds)
  Completion probability against average defender: 50.8% (avg 40 time = 4.49 seconds)

For play 2023121700-2443:
  Completion probability against 54622: 29.7% (40 time = 4.33 seconds)
  Completion probability against average defender: 50.8% (avg 40 time = 4.49 seconds)


# Metric Comparison 

In [12]:
# # aggregate by top/not top defenders 
# df_sums = df_plays.groupby("is_top").agg(
#     total_plays = ("game_play_key", "count"), 
#     avg_top_speed = ("top_speed_mph", "mean"), 
#     avg_peak_accel = ("peak_accel", "mean"), 
#     avg_separation = ("separation", "mean"), 
#     breakup_rate = ("is_breakup", "mean"), 
#     breakups = ("is_breakup", "sum") 
# ).reset_index() 

# display(df_sums)

# # add to a dataframe 
# metrics = {
#     "top": df_sums.loc[df_sums["is_top"] == 1].to_dict(orient = "records")[0], 
#     "not": df_sums.loc[df_sums["is_top"] == 0].to_dict(orient = "records")[0] 
# } 

# # calculate the percent differences 
# pct_top_speed = (metrics["top"]["avg_top_speed"] / metrics["not"]["avg_top_speed"]) - 1 
# pct_peak_accel = (metrics["top"]["avg_peak_accel"] / metrics["not"]["avg_peak_accel"]) - 1 
# pct_separation = 1 - (metrics["top"]["avg_separation"] / metrics["not"]["avg_separation"]) 
# pct_breakup = (metrics["top"]["breakup_rate"] / metrics["not"]["breakup_rate"]) - 1 

# # showcase the metrics 
# print("\nThe top 25 fastest defenders (by 40 time) ") 
# print(f"- Have a {pct_top_speed:.2%} higher average top speed ({metrics['top']['avg_top_speed']:.2f} mph vs {metrics['not']['avg_top_speed']:.2f} mph)") 
# print(f"- Have a {pct_peak_accel:.2%} higher average peak acceleration ({metrics['top']['avg_peak_accel']:.2f} vs {metrics['not']['avg_peak_accel']:.2f})") 
# print(f"- Allow {pct_separation:.2%} less average separation ({metrics['top']['avg_separation']:.2f} yards vs {metrics['not']['avg_separation']:.2f} yards)") 
# print(f"- Have a {pct_breakup:.2%} higher pass breakup rate ({metrics['top']['breakup_rate']:.2%} vs {metrics['not']['breakup_rate']:.2%})") 