In [1]:
import pandas as pd
import yaml
import joblib
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, r2_score
import numpy as np

In [31]:

def create_model(df, x_cols, y_col):
    """Train an XGBoost classifier on ``df`` and report hold-out metrics.

    Rows whose ``y_col`` value is missing are dropped first. The remaining
    rows are split 80/20 (``random_state=42``) into train and test sets, an
    ``XGBClassifier`` is fitted on the training part, and accuracy plus the
    confusion matrix on the test part are printed/displayed.

    Parameters
    ----------
    df : DataFrame holding both the feature columns and the target column.
    x_cols : list of feature column names.
    y_col : name of the target column.

    Returns
    -------
    The fitted ``xgb.XGBClassifier``.
    """
    # Only labelled rows can be used for training or evaluation.
    labelled = df.loc[df[y_col].notna()].copy()
    features = labelled[x_cols]
    target = labelled[y_col]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    classifier = xgb.XGBClassifier(eval_metric='mlogloss')
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)

    # Hold-out accuracy
    print(f'Accuracy: {accuracy_score(y_test, predicted):.2f}')

    # Hold-out confusion matrix
    cm = confusion_matrix(y_test, predicted)
    print('Confusion Matrix:')
    display(cm)
    return classifier


def create_reg_model(df, x_cols, y_col):
    """Train an XGBoost regressor on ``df`` and report hold-out error.

    Rows whose ``y_col`` value is missing are dropped first. The remaining
    rows are split 80/20 (``random_state=42``), an ``XGBRegressor`` with
    squared-error objective and 100 estimators is fitted on the training
    part, and the test-set MSE and R-squared are printed.

    Parameters
    ----------
    df : DataFrame holding both the feature columns and the target column.
    x_cols : list of feature column names.
    y_col : name of the target column.

    Returns
    -------
    tuple ``(X_test, y_pred, residuals)`` where ``residuals`` is
    ``y_test - y_pred``. Note that the fitted model itself is not returned.
    """
    # Only labelled rows can be used for training or evaluation.
    labelled = df.loc[df[y_col].notna()].copy()
    features = labelled[x_cols]
    target = labelled[y_col]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    regressor = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    # Hold-out metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'Mean Squared Error: {mse}')
    print(f'R-squared: {r2}')

    return X_test, y_pred, y_test - y_pred


In [2]:
# Load the full play-by-play table.
pbp = pd.read_parquet('data/pbp.parquet')
# game_id values look like "2022_01_BAL_NYJ"; the token before the first
# underscore is parsed as the integer year.
pbp['year'] = [int(game_id.split('_')[0]) for game_id in pbp['game_id']]
# Keep plays after 2021 and drop columns that are entirely empty in that slice.
small = pbp[pbp['year'] > 2021].dropna(axis=1, how='all')
print(pbp.columns)

Index(['play_id', 'game_id', 'old_game_id', 'home_team', 'away_team',
       'season_type', 'week', 'posteam', 'posteam_type', 'defteam',
       ...
       'defense_players', 'n_offense', 'n_defense', 'ngs_air_yards',
       'time_to_throw', 'was_pressure', 'route', 'defense_man_zone_type',
       'defense_coverage_type', 'year'],
      dtype='object', length=393)


In [3]:
# Columns that identify a play and describe it in words.
context_cols = [
    'play_id', 'game_id', 'home_team', 'away_team',
    'game_half', 'posteam', 'side_of_field', 'desc',
]
# Preview the context columns for every play that has a down recorded.
small.loc[small['down'].notna(), context_cols]

Unnamed: 0,play_id,game_id,home_team,away_team,game_half,posteam,side_of_field,desc
572975,68.0,2022_01_BAL_NYJ,NYJ,BAL,Half1,NYJ,NYJ,(14:56) 32-Mi.Carter left end to NYJ 41 for 19...
572976,89.0,2022_01_BAL_NYJ,NYJ,BAL,Half1,NYJ,NYJ,"(14:29) (No Huddle, Shotgun) 19-J.Flacco pass ..."
572977,115.0,2022_01_BAL_NYJ,NYJ,BAL,Half1,NYJ,NYJ,(14:25) (No Huddle) 32-Mi.Carter left end to N...
572978,136.0,2022_01_BAL_NYJ,NYJ,BAL,Half1,NYJ,NYJ,"(14:01) (No Huddle, Shotgun) 19-J.Flacco pass ..."
572979,172.0,2022_01_BAL_NYJ,NYJ,BAL,Half1,NYJ,NYJ,"(13:53) 7-B.Mann punts 45 yards to BAL 19, Cen..."
...,...,...,...,...,...,...,...,...
698586,4158.0,2024_11_WAS_PHI,PHI,WAS,Half2,WAS,PHI,"(1:18) (No Huddle, Shotgun) 5-J.Daniels pass s..."
698587,4183.0,2024_11_WAS_PHI,PHI,WAS,Half2,WAS,PHI,"(:56) (No Huddle, Shotgun) 5-J.Daniels pass sh..."
698588,4217.0,2024_11_WAS_PHI,PHI,WAS,Half2,WAS,PHI,"(:34) (No Huddle, Shotgun) 5-J.Daniels pass in..."
698589,4240.0,2024_11_WAS_PHI,PHI,WAS,Half2,WAS,PHI,"(:31) (No Huddle, Shotgun) 5-J.Daniels pass sh..."


In [4]:
# Integer code for each play_type value; the code is the position in this
# sequence, and a missing play type (NaN) gets the last code.
play_type_mapping = {
    play_type: code
    for code, play_type in enumerate(
        ['field_goal', 'no_play', 'pass', 'punt', 'qb_kneel', 'qb_spike', 'run', np.nan]
    )
}

In [6]:
# Game-situation features shared by the models below.
# Currently excluded (previously commented out): 'posteam', 'yards_gained',
# 'total_away_score'.
x_cols = [
    # field position, down and distance
    'yardline_100', 'down', 'goal_to_go', 'ydstogo',
    # score state
    'posteam_score', 'score_differential',
    # clock
    'quarter_seconds_remaining', 'half_seconds_remaining', 'game_seconds_remaining',
    # weather
    'wind', 'temp',
    # betting lines
    'spread_line', 'total_line',
]

# Encode the play type as an integer target column.
y_col = "play_type_enc"
small[y_col] = small['play_type'].map(play_type_mapping)

# Keep only rows where down, rush_attempt and play_type are all present,
# restricted to the feature columns plus the target.
data = small.loc[
    small['down'].notna() & small['rush_attempt'].notna() & small['play_type'].notna(),
    x_cols + [y_col],
]



In [29]:
# Rolling-stat columns for the three RB slots (suffixes _RB_0 .. _RB_2).
# The carries columns are listed in 1, 2, 0 order; that order is kept as-is
# because this list defines the feature order used downstream.
rb_stat_cols = [
    f'{stat}_roll_RB_{slot}' for stat in ('ppr', 'rush') for slot in (0, 1, 2)
] + [f'carries_roll_RB_{slot}' for slot in (1, 2, 0)]

# Per-player rolling stat columns.
rusher_stats = ['ppr_roll', 'rush_roll']

In [26]:
def get_rusher_number(row):
    """Return which RB slot (0, 1 or 2) the row's rusher occupies.

    Compares ``row['rusher_id']`` against ``row['gsis_id_RB_0']`` through
    ``row['gsis_id_RB_2']`` in order and returns the index of the first slot
    that matches, or ``None`` when no slot matches.
    """
    matching_slots = (
        slot for slot in range(3)
        if row[f'gsis_id_RB_{slot}'] == row['rusher_id']
    )
    # The generator is lazy, so slots after the first match are never inspected.
    return next(matching_slots, None)

def to_quantiles(data:pd.Series, bins:int=5):
    """Bucket a numeric Series into equal-frequency bins and show the cutoffs.

    Parameters
    ----------
    data : Series of numeric values to bucket.
    bins : number of quantile bins passed to ``pd.qcut``.

    Returns
    -------
    The result of ``pd.qcut(data, q=bins, labels=False)``: integer bin codes
    (0 .. bins-1) aligned with ``data``.

    Side effects
    ------------
    Prints a header line and displays a table with the value of ``data`` at
    quantile levels 0, 1/bins, ..., (bins-1)/bins (the lower edge of each bin).
    """
    quantiles = pd.qcut(data, q=bins, labels=False)
    print(f'{bins} quantile cutoffs for {data.name}')
    # np.linspace always yields exactly `bins` levels; np.arange with a
    # fractional step can produce an extra element through float rounding.
    cutoff_levels = np.linspace(0, 1, num=bins, endpoint=False)
    display(data.quantile(cutoff_levels).reset_index())

    return quantiles


In [27]:
# Team-level RB rolling stats, joined onto each play by offense/season/week
# (inner join, the merge default).
rb_stats = pd.read_csv('data/rb_stats.csv', index_col=0)
comb = pd.merge(
    small,
    rb_stats,
    left_on=['posteam', 'season', 'week'],
    right_on=['team_name', 'season', 'week'],
)

# Which RB slot (0-2) carried the ball; missing when the rusher is in no slot.
comb['rusher_depth_num'] = comb.apply(get_rusher_number, axis=1)

# Rushing plays only: a known RB slot and a recorded yardage.
rushes = comb[comb['rusher_depth_num'].notna() & comb['yards_gained'].notna()].copy()

# Attach the rusher's own rolling stats for the same week and season.
player_stats = pd.read_parquet('data/agg/player_weekly_agg.parquet')
rushes = player_stats[['player_id', 'ppr_roll', 'rush_roll', 'week', 'season']].merge(
    rushes,
    left_on=['player_id', 'week', 'season'],
    right_on=['rusher_id', 'week', 'season'],
)

# Classification target: quartile of yards gained.
rushes['yards_quantile'] = to_quantiles(rushes['yards_gained'], 4)
rushes[x_cols + rb_stat_cols]

4 quantile cutoffs for yards_gained


Unnamed: 0,index,yards_gained
0,0.0,-13.0
1,0.25,1.0
2,0.5,3.0
3,0.75,6.0


Unnamed: 0,yardline_100,down,goal_to_go,ydstogo,posteam_score,score_differential,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,wind,...,total_line,ppr_roll_RB_0,ppr_roll_RB_1,ppr_roll_RB_2,rush_roll_RB_0,rush_roll_RB_1,rush_roll_RB_2,carries_roll_RB_1,carries_roll_RB_2,carries_roll_RB_0
0,43.0,2.0,0,10.0,7.0,4.0,329.0,1229.0,3029.0,,...,43.5,14.050000,7.750000,1.233333,67.833333,35.333333,7.0,9.500000,2.000000,17.166667
1,36.0,3.0,0,3.0,7.0,4.0,292.0,1192.0,2992.0,,...,43.5,14.050000,7.750000,1.233333,67.833333,35.333333,7.0,9.500000,2.000000,17.166667
2,39.0,2.0,0,6.0,7.0,-9.0,683.0,1583.0,1583.0,,...,43.5,14.050000,7.750000,1.233333,67.833333,35.333333,7.0,9.500000,2.000000,17.166667
3,93.0,2.0,0,8.0,10.0,-13.0,155.0,1055.0,1055.0,,...,43.5,14.050000,7.750000,1.233333,67.833333,35.333333,7.0,9.500000,2.000000,17.166667
4,2.0,,0,0.0,24.0,-2.0,218.0,218.0,218.0,,...,43.5,14.050000,7.750000,1.233333,67.833333,35.333333,7.0,9.500000,2.000000,17.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30167,85.0,1.0,0,10.0,31.0,25.0,53.0,953.0,953.0,,...,45.5,14.983333,5.433333,1.750000,67.333333,32.166667,5.0,5.833333,1.333333,15.833333
30168,75.0,1.0,0,10.0,31.0,25.0,900.0,900.0,900.0,,...,45.5,14.983333,5.433333,1.750000,67.333333,32.166667,5.0,5.833333,1.333333,15.833333
30169,73.0,2.0,0,8.0,31.0,25.0,861.0,861.0,861.0,,...,45.5,14.983333,5.433333,1.750000,67.333333,32.166667,5.0,5.833333,1.333333,15.833333
30170,59.0,1.0,0,10.0,31.0,25.0,817.0,817.0,817.0,,...,45.5,14.983333,5.433333,1.750000,67.333333,32.166667,5.0,5.833333,1.333333,15.833333


In [32]:
# Predict which RB slot gets the carry from game situation plus RB rolling stats.
choose_rusher_cols = [*x_cols, *rb_stat_cols]
choose_rusher_model = create_model(comb, choose_rusher_cols, 'rusher_depth_num')
joblib.dump(choose_rusher_model, 'models/choose_rusher.joblib')

Accuracy: 0.75
Confusion Matrix:


array([[3300,  286,   75],
       [ 733,  980,   51],
       [ 260,  118,  235]])

['models/choose_rusher.joblib']

In [33]:
# Predict the yards-gained quantile from game situation plus the rusher's
# own rolling stats.
rush_yards_cols = (x_cols + rusher_stats)
rush_yards_model = create_model(rushes, x_cols=rush_yards_cols, y_col='yards_quantile')
# Persist the model trained in this cell. The previous code dumped
# `choose_rusher_model` here, so rush_yards.joblib held the wrong model.
joblib.dump(rush_yards_model, 'models/rush_yards.joblib')

Accuracy: 0.31
Confusion Matrix:


array([[926, 377, 400, 191],
       [639, 367, 346, 172],
       [537, 312, 365, 162],
       [481, 260, 317, 183]])

['models/rush_yards.joblib']

In [34]:
# Feature lists for each saved model, keyed by name.
feature_config = dict(
    choose_rusher_cols=choose_rusher_cols,
    rush_yards_cols=rush_yards_cols,
)
feature_config

{'choose_rusher_cols': ['yardline_100',
  'down',
  'goal_to_go',
  'ydstogo',
  'posteam_score',
  'score_differential',
  'quarter_seconds_remaining',
  'half_seconds_remaining',
  'game_seconds_remaining',
  'wind',
  'temp',
  'spread_line',
  'total_line',
  'ppr_roll_RB_0',
  'ppr_roll_RB_1',
  'ppr_roll_RB_2',
  'rush_roll_RB_0',
  'rush_roll_RB_1',
  'rush_roll_RB_2',
  'carries_roll_RB_1',
  'carries_roll_RB_2',
  'carries_roll_RB_0'],
 'rush_yards_cols': ['yardline_100',
  'down',
  'goal_to_go',
  'ydstogo',
  'posteam_score',
  'score_differential',
  'quarter_seconds_remaining',
  'half_seconds_remaining',
  'game_seconds_remaining',
  'wind',
  'temp',
  'spread_line',
  'total_line',
  'ppr_roll',
  'rush_roll']}

In [35]:
# Merge this notebook's feature lists into the shared YAML config, keeping
# any keys already stored there.
# NOTE(review): this rebinds `data`, which an earlier cell used for a
# DataFrame; the name is kept in case later cells rely on it.
yaml_path = "models/feature_config.yaml"
try:
    with open(yaml_path, 'r') as file:
        # safe_load returns None for an empty file; fall back to an empty
        # mapping so the update below does not fail.
        data = yaml.safe_load(file) or {}
except FileNotFoundError:
    # First run: no config file yet, so start from an empty mapping.
    data = {}

# Update the data with the provided updates
data.update(feature_config)

with open(yaml_path, 'w') as file:
    yaml.safe_dump(data, file)