In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
import missingno as msno
import pickle
from tqdm import tqdm
from imblearn.over_sampling import RandomOverSampler

from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Lasso
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

import xgboost as xg
from sklearn.metrics import mean_squared_error

from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.datasets import make_classification
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns',200)
pd.set_option('display.max_rows',1000)

import warnings
warnings.filterwarnings("ignore")

In [2]:
pbar = tqdm(total=9)
for i in range(1,10):
    print(f"Week {i} -- STARTING")
    target_variable = ['tackle']
    OVERSAMPLE_FLAG = True

    df = pd.read_csv(f'../data/processed/ML_Dataset_Week_{i}.csv').drop('Unnamed: 0',axis=1)
    for col in ['SAMP_gameId','SAMP_playId','event','possessionTeam','CLOSEST_OPP_ID']:
        try:
            df = df.drop(col,axis=1)
        except:
            pass

    off_df = df[df['OFF_DEF_FLAG']=='OFF'].reset_index().drop(['index','OFF_DEF_FLAG'],axis=1)
    df = df[df['OFF_DEF_FLAG']=='DEF'].reset_index().drop(['index','OFF_DEF_FLAG'],axis=1)

    # Dummify Categoricals
    dummy_parent_cols = ['position', 'offenseFormation']
    dummy_child_cols = ['position_CB','position_DB','position_DE','position_DT','position_FS','position_ILB',
                        'position_MLB','position_NT','position_OLB','position_SS','offenseFormation_EMPTY',
                        'offenseFormation_I_FORM','offenseFormation_JUMBO','offenseFormation_PISTOL',
                        'offenseFormation_SHOTGUN','offenseFormation_SINGLEBACK','offenseFormation_WILDCAT']

    for col in dummy_child_cols:
        for prnt_col in dummy_parent_cols:
            if prnt_col in col:
                for idx,row in df.iterrows():
                    if row[prnt_col] == str(col.split('_')[1]):
                        df.at[idx,col] = 1
                    else:
                        df.at[idx,col] = 0

    df = df.drop(dummy_parent_cols,axis=1)

    # INFO DF
    info_df = df[['gameId','playId','nflId','displayName','frameId','time','jerseyNumber','club','x','y','s','a','dis','o',
                  'dir','tackle','assist','forcedFumble','pff_missedTackle']]
    off_info_df = off_df[['gameId','playId','nflId','displayName','frameId','time','jerseyNumber','club','x','y','s','a','dis','o',
                          'dir','tackle','assist','forcedFumble','pff_missedTackle']]
    # Input DF
    X = df.drop(['gameId','playId','nflId','displayName','frameId','time','jerseyNumber','club',
                 'tackle','assist','forcedFumble','pff_missedTackle','playDirection'],axis=1)

    assert X.columns.tolist() == ['x','y','s','a','dis','o','dir','weight','height','defendersInTheBox','YardsFromScoring',
                                  'quarter','down','yardsToGo','Stadium_Domed','Stadium_Turfed','Kickoff_temp',
                                  'Kickoff_dwpt','Kickoff_rhum','Kickoff_prcp','Kickoff_snow','Kickoff_wdir',
                                  'Kickoff_wspd','Kickoff_pres','DISTANCE_TO_CLOSEST_OPP','DEF_ORIENTATION',
                                  'ENGAGED_IN_BLOCK','DISTANCE_TO_BALL','OPEN_PATH_TO_BALL',
                                  'FREE_BLOCKER_W_BALL_CARRIER_FLAG','position_CB','position_DB','position_DE',
                                  'position_DT','position_FS','position_ILB','position_MLB','position_NT','position_OLB',
                                  'position_SS','offenseFormation_EMPTY','offenseFormation_I_FORM',
                                  'offenseFormation_JUMBO','offenseFormation_PISTOL','offenseFormation_SHOTGUN',
                                  'offenseFormation_SINGLEBACK','offenseFormation_WILDCAT'], f"ERROR - X.columns doesn't match expected values"

    # Load The Models
    tackle_model = pickle.load(open(f'../models/XGBdart_NFL_BDB_TACKLE_MODEL.sav', 'rb'))
    assist_model = pickle.load(open(f'../models/XGBdart_NFL_BDB_ASSIST_MODEL.sav', 'rb'))
    forced_fumble_model = pickle.load(open(f'../models/XGBdart_NFL_BDB_FORCED_FUMBLE_MODEL.sav', 'rb'))
    missed_tackle_model = pickle.load(open(f'../models/XGBdart_NFL_BDB_MISSED_TACKLE_MODEL.sav', 'rb'))

    # Make The Preds
    tackle_preds = tackle_model.predict_proba(X)
    tackle_preds = pd.DataFrame(tackle_preds)
    tackle_preds.columns = ['DEL','p(Tackle)']
    tackle_preds = tackle_preds.drop(['DEL'],axis=1)

    assist_preds = assist_model.predict_proba(X)
    assist_preds = pd.DataFrame(assist_preds)
    assist_preds.columns = ['DEL','p(Assist)']
    assist_preds = assist_preds.drop(['DEL'],axis=1)

    forced_fumble_preds = forced_fumble_model.predict_proba(X)
    forced_fumble_preds = pd.DataFrame(forced_fumble_preds)
    forced_fumble_preds.columns = ['DEL','p(Forced Fumble)']
    forced_fumble_preds = forced_fumble_preds.drop(['DEL'],axis=1)

    missed_tackle_preds = missed_tackle_model.predict_proba(X)
    missed_tackle_preds = pd.DataFrame(missed_tackle_preds)
    missed_tackle_preds.columns = ['DEL','p(Missed Tackle)']
    missed_tackle_preds = missed_tackle_preds.drop(['DEL'],axis=1)

    # Add To Info DF
    info_df['p(Tackle)'] = tackle_preds
    info_df['p(Assist)'] = assist_preds
    info_df['p(Forced Fumble)'] = forced_fumble_preds
    info_df['p(Missed Tackle)'] = missed_tackle_preds

    # Merge Back In Offensive Player Tracking Data
    for col in info_df.columns.tolist():
        if col not in off_info_df.columns.tolist():
            off_info_df[col] = ''

    info_df = pd.concat([info_df, off_info_df]).reset_index().drop('index',axis=1)

    # Serialize The Data
    info_df.to_csv(f'../data/processed/algo_output_data_week_{i}.csv')
    
    pbar.update(1)
pbar.close()

  0%|                                                                                                                                                                                     | 0/9 [00:00<?, ?it/s]

Week 1 -- STARTING


 11%|██████████████████▉                                                                                                                                                       | 1/9 [09:51<1:18:53, 591.70s/it]

Week 2 -- STARTING


 22%|█████████████████████████████████████▊                                                                                                                                    | 2/9 [19:22<1:07:36, 579.56s/it]

Week 3 -- STARTING


 33%|█████████████████████████████████████████████████████████▎                                                                                                                  | 3/9 [29:09<58:18, 583.02s/it]

Week 4 -- STARTING


 44%|████████████████████████████████████████████████████████████████████████████▍                                                                                               | 4/9 [38:57<48:44, 584.97s/it]

Week 5 -- STARTING


 56%|███████████████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 5/9 [49:18<39:51, 597.92s/it]

Week 6 -- STARTING


 67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                         | 6/9 [57:59<28:34, 571.60s/it]

Week 7 -- STARTING


 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 7/9 [1:06:54<18:39, 559.57s/it]

Week 8 -- STARTING


 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                   | 8/9 [1:16:37<09:27, 567.25s/it]

Week 9 -- STARTING


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [1:24:34<00:00, 563.79s/it]


In [4]:
# NO WEATHER MODEL TRAINING (WK 1 ONLY)
i=1

target_variable = ['tackle']
OVERSAMPLE_FLAG = True

df = pd.read_csv(f'../data/processed/ML_Dataset_Week_{i}.csv').drop('Unnamed: 0',axis=1)
for col in ['SAMP_gameId','SAMP_playId','event','possessionTeam','CLOSEST_OPP_ID']:
    try:
        df = df.drop(col,axis=1)
    except:
        pass

off_df = df[df['OFF_DEF_FLAG']=='OFF'].reset_index().drop(['index','OFF_DEF_FLAG'],axis=1)
df = df[df['OFF_DEF_FLAG']=='DEF'].reset_index().drop(['index','OFF_DEF_FLAG'],axis=1)

# Dummify Categoricals
dummy_parent_cols = ['position', 'offenseFormation']
dummy_child_cols = ['position_CB','position_DB','position_DE','position_DT','position_FS','position_ILB',
                    'position_MLB','position_NT','position_OLB','position_SS','offenseFormation_EMPTY',
                    'offenseFormation_I_FORM','offenseFormation_JUMBO','offenseFormation_PISTOL',
                    'offenseFormation_SHOTGUN','offenseFormation_SINGLEBACK','offenseFormation_WILDCAT']

for col in dummy_child_cols:
    for prnt_col in dummy_parent_cols:
        if prnt_col in col:
            for idx,row in df.iterrows():
                if row[prnt_col] == str(col.split('_')[1]):
                    df.at[idx,col] = 1
                else:
                    df.at[idx,col] = 0

df = df.drop(dummy_parent_cols,axis=1)

# INFO DF
info_df = df[['gameId','playId','nflId','displayName','frameId','time','jerseyNumber','club','x','y','s','a','dis','o',
              'dir','tackle','assist','forcedFumble','pff_missedTackle']]
off_info_df = off_df[['gameId','playId','nflId','displayName','frameId','time','jerseyNumber','club','x','y','s','a','dis','o',
                      'dir','tackle','assist','forcedFumble','pff_missedTackle']]
# Input DF
X = df.drop(['gameId','playId','nflId','displayName','frameId','time','jerseyNumber','club',
             'tackle','assist','forcedFumble','pff_missedTackle','playDirection','Kickoff_temp',
                              'Kickoff_dwpt','Kickoff_rhum','Kickoff_prcp','Kickoff_snow','Kickoff_wdir',
                              'Kickoff_wspd','Kickoff_pres'],axis=1)

assert X.columns.tolist() == ['x','y','s','a','dis','o','dir','weight','height','defendersInTheBox','YardsFromScoring',
                              'quarter','down','yardsToGo','Stadium_Domed','Stadium_Turfed','DISTANCE_TO_CLOSEST_OPP','DEF_ORIENTATION',
                              'ENGAGED_IN_BLOCK','DISTANCE_TO_BALL','OPEN_PATH_TO_BALL',
                              'FREE_BLOCKER_W_BALL_CARRIER_FLAG','position_CB','position_DB','position_DE',
                              'position_DT','position_FS','position_ILB','position_MLB','position_NT','position_OLB',
                              'position_SS','offenseFormation_EMPTY','offenseFormation_I_FORM',
                              'offenseFormation_JUMBO','offenseFormation_PISTOL','offenseFormation_SHOTGUN',
                              'offenseFormation_SINGLEBACK','offenseFormation_WILDCAT'], f"ERROR - X.columns doesn't match expected values"

# Load The Models
tackle_model = pickle.load(open(f'../models/XGBdart_NFL_BDB_TACKLE_MODEL_NO_WEATHER.sav', 'rb'))
assist_model = pickle.load(open(f'../models/XGBdart_NFL_BDB_ASSIST_MODEL.sav', 'rb'))
forced_fumble_model = pickle.load(open(f'../models/XGBdart_NFL_BDB_FORCED_FUMBLE_MODEL.sav', 'rb'))
missed_tackle_model = pickle.load(open(f'../models/XGBdart_NFL_BDB_MISSED_TACKLE_MODEL.sav', 'rb'))

# Make The Preds
tackle_preds = tackle_model.predict_proba(X)
tackle_preds = pd.DataFrame(tackle_preds)
tackle_preds.columns = ['DEL','p(Tackle)']
tackle_preds = tackle_preds.drop(['DEL'],axis=1)

# Add To Info DF
info_df['p(Tackle)'] = tackle_preds


# Merge Back In Offensive Player Tracking Data
for col in info_df.columns.tolist():
    if col not in off_info_df.columns.tolist():
        off_info_df[col] = ''

info_df = pd.concat([info_df, off_info_df]).reset_index().drop('index',axis=1)

# Serialize The Data
info_df.to_csv(f'../data/processed/algo_output_NO_WEATHER_data_week_{i}.csv')