In [1]:
import pandas as pd
import yaml
import joblib
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

In [2]:
pbp = pd.read_parquet('data/pbp.parquet')
pbp['year'] = pbp['game_id'].apply(lambda x: int(x.split('_')[0]))
small = pbp.loc[pbp.year>2021].dropna(how='all',axis=1)
print(pbp.columns)

Index(['play_id', 'game_id', 'old_game_id', 'home_team', 'away_team',
       'season_type', 'week', 'posteam', 'posteam_type', 'defteam',
       ...
       'defense_players', 'n_offense', 'n_defense', 'ngs_air_yards',
       'time_to_throw', 'was_pressure', 'route', 'defense_man_zone_type',
       'defense_coverage_type', 'year'],
      dtype='object', length=393)


In [3]:
pbp.sample(2)

Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,...,defense_players,n_offense,n_defense,ngs_air_yards,time_to_throw,was_pressure,route,defense_man_zone_type,defense_coverage_type,year
418380,841.0,2018_15_DET_BUF,,BUF,DET,REG,15,DET,away,BUF,...,00-0030455;00-0034511;00-0033884;00-0030090;00...,11.0,11.0,,,,,,,2018
697737,3676.0,2024_10_NYJ_ARI,2024111009.0,ARI,NYJ,REG,10,ARI,home,NYJ,...,,,,,,,,,,2024


In [4]:
context_cols = ['play_id','game_id','home_team','away_team','game_half','posteam','side_of_field','desc']
small.loc[~small.down.isna()][context_cols]

Unnamed: 0,play_id,game_id,home_team,away_team,game_half,posteam,side_of_field,desc
572975,68.0,2022_01_BAL_NYJ,NYJ,BAL,Half1,NYJ,NYJ,(14:56) 32-Mi.Carter left end to NYJ 41 for 19...
572976,89.0,2022_01_BAL_NYJ,NYJ,BAL,Half1,NYJ,NYJ,"(14:29) (No Huddle, Shotgun) 19-J.Flacco pass ..."
572977,115.0,2022_01_BAL_NYJ,NYJ,BAL,Half1,NYJ,NYJ,(14:25) (No Huddle) 32-Mi.Carter left end to N...
572978,136.0,2022_01_BAL_NYJ,NYJ,BAL,Half1,NYJ,NYJ,"(14:01) (No Huddle, Shotgun) 19-J.Flacco pass ..."
572979,172.0,2022_01_BAL_NYJ,NYJ,BAL,Half1,NYJ,NYJ,"(13:53) 7-B.Mann punts 45 yards to BAL 19, Cen..."
...,...,...,...,...,...,...,...,...
698586,4158.0,2024_11_WAS_PHI,PHI,WAS,Half2,WAS,PHI,"(1:18) (No Huddle, Shotgun) 5-J.Daniels pass s..."
698587,4183.0,2024_11_WAS_PHI,PHI,WAS,Half2,WAS,PHI,"(:56) (No Huddle, Shotgun) 5-J.Daniels pass sh..."
698588,4217.0,2024_11_WAS_PHI,PHI,WAS,Half2,WAS,PHI,"(:34) (No Huddle, Shotgun) 5-J.Daniels pass in..."
698589,4240.0,2024_11_WAS_PHI,PHI,WAS,Half2,WAS,PHI,"(:31) (No Huddle, Shotgun) 5-J.Daniels pass sh..."


In [5]:
[x for x in small.columns if 'play_clock' in x.lower()]

['play_clock']

In [6]:
small[['desc', 'time',
 'home_timeouts_remaining',
 'away_timeouts_remaining',
 'timeout',
 'timeout_team',
 'posteam_timeouts_remaining',
 'defteam_timeouts_remaining',
 'start_time',
 'end_clock_time',
 'drive_time_of_possession',
 'time_to_throw']]

Unnamed: 0,desc,time,home_timeouts_remaining,away_timeouts_remaining,timeout,timeout_team,posteam_timeouts_remaining,defteam_timeouts_remaining,start_time,end_clock_time,drive_time_of_possession,time_to_throw
572973,GAME,15:00,3.0,3.0,,,,,"9/11/22, 13:05:56",,,
572974,9-J.Tucker kicks 68 yards from BAL 35 to NYJ -...,15:00,3.0,3.0,0.0,,3.0,3.0,"9/11/22, 13:05:56",2022-09-11T17:06:05.433Z,1:18,
572975,(14:56) 32-Mi.Carter left end to NYJ 41 for 19...,14:56,3.0,3.0,0.0,,3.0,3.0,"9/11/22, 13:05:56",2022-09-11T17:06:44.890Z,1:18,
572976,"(14:29) (No Huddle, Shotgun) 19-J.Flacco pass ...",14:29,3.0,3.0,0.0,,3.0,3.0,"9/11/22, 13:05:56",2022-09-11T17:07:08.393Z,1:18,3.07
572977,(14:25) (No Huddle) 32-Mi.Carter left end to N...,14:25,3.0,3.0,0.0,,3.0,3.0,"9/11/22, 13:05:56",2022-09-11T17:07:32.350Z,1:18,
...,...,...,...,...,...,...,...,...,...,...,...,...
698591,TWO-POINT CONVERSION ATTEMPT. 5-J.Daniels pass...,00:28,2.0,2.0,0.0,,2.0,2.0,"11/14/24, 20:15:56",,4:10,
698592,Timeout #2 by PHI at 00:28.,00:28,1.0,2.0,1.0,PHI,,,"11/14/24, 20:15:56",,,
698593,47-Z.Gonzalez kicks onside 12 yards from WAS 3...,00:28,1.0,2.0,0.0,,1.0,2.0,"11/14/24, 20:15:56",2024-11-15T04:16:56.543Z,0:28,
698594,(:27) 1-J.Hurts kneels to WAS 48 for -1 yards.,00:27,1.0,2.0,0.0,,1.0,2.0,"11/14/24, 20:15:56",2024-11-15T04:17:36.190Z,0:28,


In [7]:
play_type_mapping = {
    'field_goal': 0,
    'no_play': 1,
    'pass': 2,
    'punt': 3,
    'qb_kneel': 4,
    'qb_spike': 5,
    'run': 6,
    np.nan: 7
}

In [8]:
x_cols = [
    #'posteam',
    "yardline_100",
    "down",
    "goal_to_go",
    "ydstogo",
    #"yards_gained",
    "posteam_score",
    #"total_away_score",
    "score_differential",
    'quarter_seconds_remaining',
    'half_seconds_remaining',
    'game_seconds_remaining',
    "wind",
    "temp",
    'spread_line',
    'total_line'
]

small["play_type_enc"] = small['play_type'].map(play_type_mapping)
y_col = ["play_type_enc"]
data = small.loc[(~small.down.isna()) & (~small.rush_attempt.isna()) & ~(small.play_type.isna())][x_cols + y_col]



In [9]:
X, y = data[x_cols], data[y_col]


# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = xgb.XGBClassifier( eval_metric='mlogloss')
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')


Accuracy: 0.67


In [10]:
enc_back = {k:v for v,k in play_type_mapping.items() if k != 7}
enc_back

{0: 'field_goal',
 1: 'no_play',
 2: 'pass',
 3: 'punt',
 4: 'qb_kneel',
 5: 'qb_spike',
 6: 'run'}

In [11]:
y_test.value_counts()

play_type_enc
2.0              10227
6.0               7609
1.0               1272
3.0               1144
0.0                614
4.0                226
5.0                 38
Name: count, dtype: int64

In [12]:
pd.Series(y_pred).value_counts()

2    10954
6     8052
3     1240
0      644
4      215
1       15
5       10
Name: count, dtype: int64

In [13]:
data.apply(type).unique()

array([<class 'pandas.core.series.Series'>], dtype=object)

In [14]:
comb = X_test.copy(deep=True)
comb['y_pred'] = y_pred
comb['pred_str'] =[enc_back[x] for x in y_pred]
comb['y_act'] = y_test
comb['str_act'] = comb['y_act'].apply(lambda x: enc_back[x])
x = comb.loc[comb.down==4].groupby(['str_act', 'y_act'])['pred_str'].value_counts(normalize=True).reset_index().sort_values(by=['str_act','proportion'],ascending=False)
x['proportion'] = x['proportion'] * 100
x

Unnamed: 0,str_act,y_act,pred_str,proportion
21,run,6.0,run,68.75
22,run,6.0,pass,16.25
23,run,6.0,punt,8.75
24,run,6.0,field_goal,5.0
25,run,6.0,no_play,1.25
20,qb_kneel,4.0,run,100.0
15,punt,3.0,punt,97.202797
16,punt,3.0,pass,1.311189
17,punt,3.0,run,1.223776
18,punt,3.0,field_goal,0.174825


In [15]:
comb.loc[comb.down==4]['pred_str'].value_counts().reset_index()

Unnamed: 0,pred_str,count
0,punt,1240
1,field_goal,593
2,pass,219
3,run,180
4,no_play,9


In [16]:
joblib.dump(model, 'models/run_or_pass.joblib')

['models/run_or_pass.joblib']

In [17]:
{k:v for v,k in play_type_mapping.items()}[1]

'no_play'

In [18]:
feature_config = {
    'run_or_pass_cols':x_cols,
    'play_encoding': {k:v for v,k in play_type_mapping.items()}
}
feature_config

{'run_or_pass_cols': ['yardline_100',
  'down',
  'goal_to_go',
  'ydstogo',
  'posteam_score',
  'score_differential',
  'quarter_seconds_remaining',
  'half_seconds_remaining',
  'game_seconds_remaining',
  'wind',
  'temp',
  'spread_line',
  'total_line'],
 'play_encoding': {0: 'field_goal',
  1: 'no_play',
  2: 'pass',
  3: 'punt',
  4: 'qb_kneel',
  5: 'qb_spike',
  6: 'run',
  7: nan}}

In [19]:
with open("models/feature_config.yaml", "w") as file:
    yaml.dump(feature_config, file)

In [20]:
small.loc[small.play_type=='no_play'][['desc','first_down_penalty',
 'penalty',
 'penalty_team',
 'penalty_player_id',
 'penalty_player_name',
 'penalty_yards',
 'penalty_type',
 'drive_yards_penalized']].sample(10)

Unnamed: 0,desc,first_down_penalty,penalty,penalty_team,penalty_player_id,penalty_player_name,penalty_yards,penalty_type,drive_yards_penalized
593411,Timeout #1 by DET at 01:51.,0.0,0.0,,,,,,0.0
641908,(2:00) (Shotgun) 30-C.Hubbard right tackle to ...,0.0,1.0,CAR,00-0034281,B.Bozeman,10.0,Offensive Holding,0.0
578973,Timeout #3 by MIA at 01:57.,0.0,0.0,,,,,,15.0
575804,Timeout #1 by DAL at 03:08.,0.0,0.0,,,,,,0.0
593364,"(7:13) (Shotgun) PENALTY on DET-68-T.Decker, F...",0.0,1.0,DET,00-0032381,T.Decker,5.0,False Start,-20.0
574458,"(2:29) PENALTY on LAC-7-G.Everett, False Start...",0.0,1.0,LAC,00-0033895,G.Everett,5.0,False Start,-15.0
671859,Timeout #3 by BAL at 02:19.,0.0,0.0,,,,,,20.0
635263,(11:58) NO 7-Hill now at QB. (Shotgun) 41-A.Ka...,,,,,,,,-5.0
650319,(2:53) (Shotgun) 1-J.Hurts pass short right to...,1.0,1.0,KC,00-0036374,L.Sneed,5.0,Illegal Use of Hands,5.0
615526,Timeout #1 by WAS at 03:06.,0.0,0.0,,,,,,0.0


In [21]:
small.head(5)[['desc','down','yards_gained']]

Unnamed: 0,desc,down,yards_gained
572973,GAME,,
572974,9-J.Tucker kicks 68 yards from BAL 35 to NYJ -...,,0.0
572975,(14:56) 32-Mi.Carter left end to NYJ 41 for 19...,1.0,19.0
572976,"(14:29) (No Huddle, Shotgun) 19-J.Flacco pass ...",1.0,0.0
572977,(14:25) (No Huddle) 32-Mi.Carter left end to N...,2.0,5.0


In [22]:
[x for x in small.columns if 'rush' in x.lower()]

['total_home_rush_epa',
 'total_away_rush_epa',
 'total_home_rush_wpa',
 'total_away_rush_wpa',
 'first_down_rush',
 'rush_attempt',
 'rush_touchdown',
 'lateral_rush',
 'rusher_player_id',
 'rusher_player_name',
 'rushing_yards',
 'lateral_rusher_player_id',
 'lateral_rusher_player_name',
 'lateral_rushing_yards',
 'rusher',
 'rusher_jersey_number',
 'rush',
 'rusher_id',
 'number_of_pass_rushers']

In [23]:
small.loc[(small.rusher_player_name=='S.Barkley')
          &
          (small.goal_to_go == 1)][['week','home_team','away_team','rusher_player_name','goal_to_go','yardline_100','yards_gained', 'ydstogo', 'rush', 'rushing_yards', 'rush_touchdown']].reset_index(drop=True)

Unnamed: 0,week,home_team,away_team,rusher_player_name,goal_to_go,yardline_100,yards_gained,ydstogo,rush,rushing_yards,rush_touchdown
0,1,TEN,NYG,S.Barkley,1,4.0,4.0,4.0,1.0,4.0,1.0
1,1,TEN,NYG,S.Barkley,1,3.0,2.0,3.0,1.0,2.0,0.0
2,2,NYG,CAR,S.Barkley,1,5.0,3.0,5.0,1.0,3.0,0.0
3,5,GB,NYG,S.Barkley,1,2.0,2.0,2.0,1.0,2.0,1.0
4,6,NYG,BAL,S.Barkley,1,10.0,5.0,10.0,1.0,5.0,0.0
5,6,NYG,BAL,S.Barkley,1,1.0,1.0,1.0,1.0,1.0,1.0
6,8,SEA,NYG,S.Barkley,1,2.0,1.0,2.0,1.0,1.0,0.0
7,8,SEA,NYG,S.Barkley,1,1.0,1.0,1.0,1.0,1.0,1.0
8,10,NYG,HOU,S.Barkley,1,11.0,2.0,11.0,1.0,2.0,0.0
9,10,NYG,HOU,S.Barkley,1,4.0,2.0,4.0,1.0,2.0,0.0


In [24]:
r_data = small.loc[small.rush==1]
r_data
rx_cols = [
    #'posteam',
    "yardline_100",
    "down",
    "goal_to_go",
    "ydstogo",
    #"yards_gained",
    "posteam_score",
    "total_away_score",
    "score_differential",
    'quarter_seconds_remaining',
    'half_seconds_remaining',
    'game_seconds_remaining',
    "wind",
    "temp",
    'spread_line',
    'total_line'
]
r_data[context_cols + rx_cols]

Unnamed: 0,play_id,game_id,home_team,away_team,game_half,posteam,side_of_field,desc,yardline_100,down,...,posteam_score,total_away_score,score_differential,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,wind,temp,spread_line,total_line
572975,68.0,2022_01_BAL_NYJ,NYJ,BAL,Half1,NYJ,NYJ,(14:56) 32-Mi.Carter left end to NYJ 41 for 19...,78.0,1.0,...,0.0,0.0,0.0,896.0,1796.0,3596.0,,,-6.5,44.0
572977,115.0,2022_01_BAL_NYJ,NYJ,BAL,Half1,NYJ,NYJ,(14:25) (No Huddle) 32-Mi.Carter left end to N...,59.0,2.0,...,0.0,0.0,0.0,865.0,1765.0,3565.0,,,-6.5,44.0
572982,254.0,2022_01_BAL_NYJ,NYJ,BAL,Half1,BAL,BAL,(12:41) (Shotgun) 8-L.Jackson right tackle to ...,64.0,3.0,...,0.0,0.0,0.0,761.0,1661.0,3461.0,,,-6.5,44.0
572983,275.0,2022_01_BAL_NYJ,NYJ,BAL,Half1,BAL,BAL,(12:01) (Shotgun) 17-K.Drake right end to BAL ...,60.0,1.0,...,0.0,0.0,0.0,721.0,1621.0,3421.0,,,-6.5,44.0
572985,330.0,2022_01_BAL_NYJ,NYJ,BAL,Half1,BAL,BAL,(10:40) 17-K.Drake right tackle to BAL 49 for ...,52.0,3.0,...,0.0,0.0,0.0,640.0,1540.0,3340.0,,,-6.5,44.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
698569,3722.0,2024_11_WAS_PHI,PHI,WAS,Half2,PHI,WAS,(4:50) 26-S.Barkley left guard to WAS 39 for 7...,46.0,1.0,...,19.0,10.0,9.0,290.0,290.0,290.0,,,4.5,49.5
698571,3751.0,2024_11_WAS_PHI,PHI,WAS,Half2,PHI,WAS,"(4:45) 26-S.Barkley right guard for 39 yards, ...",39.0,2.0,...,19.0,10.0,9.0,285.0,285.0,285.0,,,4.5,49.5
698580,4000.0,2024_11_WAS_PHI,PHI,WAS,Half2,WAS,PHI,"(3:19) (No Huddle, Shotgun) 26-J.McNichols lef...",38.0,2.0,...,10.0,10.0,-16.0,199.0,199.0,199.0,,,4.5,49.5
698582,4052.0,2024_11_WAS_PHI,PHI,WAS,Half2,WAS,PHI,"(2:53) (No Huddle, Shotgun) 26-J.McNichols up ...",23.0,2.0,...,10.0,10.0,-16.0,173.0,173.0,173.0,,,4.5,49.5
