In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np 
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN

In [4]:
# loading the data in 

file_path = Path("Resources/clean_nfl_data.csv")

df = pd.read_csv(file_path)

df

Unnamed: 0.1,Unnamed: 0,play_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,side_of_field,...,jersey_number,id,qb_epa,xyac_epa,xyac_mean_yardage,xyac_median_yardage,xyac_success,xyac_fd,xpass,pass_oe
0,2,55,TEN,ARI,REG,1,TEN,home,ARI,TEN,...,22.0,00-0032764,-1.399805,,,,,,0.491433,-49.143300
1,3,76,TEN,ARI,REG,1,TEN,home,ARI,TEN,...,17.0,00-0029701,0.032412,1.165133,5.803177,4.0,0.896654,0.125098,0.697346,30.265415
2,4,100,TEN,ARI,REG,1,TEN,home,ARI,TEN,...,17.0,00-0029701,-1.532898,0.256036,4.147637,2.0,0.965009,0.965009,0.978253,2.174652
3,6,152,TEN,ARI,REG,1,ARI,away,TEN,ARI,...,1.0,00-0035228,2.692890,0.567838,7.420427,4.0,1.000000,1.000000,0.458989,54.101130
4,7,181,TEN,ARI,REG,1,ARI,away,TEN,TEN,...,1.0,00-0035228,-1.009908,,,,,,0.419833,-41.983326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5018,7202,1908,KC,LAC,REG,3,KC,home,LAC,LAC,...,15.0,00-0033873,-0.127200,0.719127,9.721938,9.0,0.123323,0.108665,0.974273,2.572680
5019,7209,2045,KC,LAC,REG,3,KC,home,LAC,KC,...,15.0,00-0033873,0.808641,0.211871,3.551042,1.0,1.000000,0.999365,0.482662,51.733780
5020,7210,2069,KC,LAC,REG,3,KC,home,LAC,KC,...,25.0,00-0036360,0.619626,,,,,,0.448400,-44.839950
5021,7211,2090,KC,LAC,REG,3,KC,home,LAC,KC,...,15.0,00-0033873,-0.537923,0.600601,7.770729,4.0,1.000000,1.000000,0.453813,54.618725


In [5]:
df = df.dropna(axis = 'columns' , how='any')

df = df.dropna()


In [6]:
df['play_type'] = df['play_type'].astype(str).str.replace('run' , '1')
df['play_type'] = df['play_type'].astype(str).str.replace('pass' , '0')
df['play_type'] = df['play_type'].astype('float')
df


Unnamed: 0.1,Unnamed: 0,play_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,side_of_field,...,touchdown,pass_touchdown,rush_touchdown,complete_pass,season,series,series_success,series_result,start_time,time_of_day
0,2,55,TEN,ARI,REG,1,TEN,home,ARI,TEN,...,0,0,0,0,2021,1,0,Punt,13:00:00,17:06:37
1,3,76,TEN,ARI,REG,1,TEN,home,ARI,TEN,...,0,0,0,1,2021,1,0,Punt,13:00:00,17:07:14
2,4,100,TEN,ARI,REG,1,TEN,home,ARI,TEN,...,0,0,0,0,2021,1,0,Punt,13:00:00,17:07:54
3,6,152,TEN,ARI,REG,1,ARI,away,TEN,ARI,...,0,0,0,1,2021,2,1,First down,13:00:00,17:09:19
4,7,181,TEN,ARI,REG,1,ARI,away,TEN,TEN,...,0,0,0,0,2021,3,1,First down,13:00:00,17:09:59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5018,7202,1908,KC,LAC,REG,3,KC,home,LAC,LAC,...,0,0,0,1,2021,30,0,Field goal,13:00:00,18:20:43
5019,7209,2045,KC,LAC,REG,3,KC,home,LAC,KC,...,0,0,0,1,2021,32,1,First down,13:00:00,18:39:10
5020,7210,2069,KC,LAC,REG,3,KC,home,LAC,KC,...,0,0,0,0,2021,33,1,First down,13:00:00,18:39:40
5021,7211,2090,KC,LAC,REG,3,KC,home,LAC,KC,...,0,0,0,0,2021,34,1,First down,13:00:00,18:40:18


In [7]:
x = {'run': '1.0' , 'pass': '0.0'}
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0.1,Unnamed: 0,play_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,side_of_field,...,touchdown,pass_touchdown,rush_touchdown,complete_pass,season,series,series_success,series_result,start_time,time_of_day
0,2,55,TEN,ARI,REG,1,TEN,home,ARI,TEN,...,0,0,0,0,2021,1,0,Punt,13:00:00,17:06:37
1,3,76,TEN,ARI,REG,1,TEN,home,ARI,TEN,...,0,0,0,1,2021,1,0,Punt,13:00:00,17:07:14
2,4,100,TEN,ARI,REG,1,TEN,home,ARI,TEN,...,0,0,0,0,2021,1,0,Punt,13:00:00,17:07:54
3,6,152,TEN,ARI,REG,1,ARI,away,TEN,ARI,...,0,0,0,1,2021,2,1,First down,13:00:00,17:09:19
4,7,181,TEN,ARI,REG,1,ARI,away,TEN,TEN,...,0,0,0,0,2021,3,1,First down,13:00:00,17:09:59


In [8]:
X = df.drop('play_type' , axis=1)

X = pd.get_dummies(X)

# Create our target
y = df['play_type']

In [9]:
X.describe()

Unnamed: 0.1,Unnamed: 0,play_id,week,yardline_100,drive,qtr,ydstogo,ydsnet,yards_gained,shotgun,...,time_of_day_3:49:24,time_of_day_3:49:32,time_of_day_3:50:02,time_of_day_3:50:17,time_of_day_3:50:44,time_of_day_3:50:59,time_of_day_3:51:30,time_of_day_3:52:40,time_of_day_3:54:18,time_of_day_3:54:46
count,5023.0,5023.0,5023.0,5023.0,5023.0,5023.0,5023.0,5023.0,5023.0,5023.0,...,5023.0,5023.0,5023.0,5023.0,5023.0,5023.0,5023.0,5023.0,5023.0,5023.0
mean,3580.6086,2061.084412,1.795541,50.914593,11.062512,2.554051,8.486363,45.364324,5.661358,0.657376,...,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199
std,2084.122874,1221.906625,0.756101,24.285391,6.481453,1.132858,4.05711,28.252437,8.676368,0.474634,...,0.01411,0.01411,0.01411,0.01411,0.01411,0.01411,0.01411,0.01411,0.01411,0.01411
min,2.0,54.0,1.0,1.0,1.0,1.0,0.0,-16.0,-18.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1762.5,995.5,1.0,33.0,6.0,2.0,6.0,21.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3567.0,2050.0,2.0,54.0,11.0,3.0,10.0,48.0,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5391.5,3093.5,2.0,72.0,16.0,4.0,10.0,72.0,8.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,7212.0,4878.0,3.0,99.0,27.0,5.0,39.0,97.0,91.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
y.value_counts()

0.0    2995
1.0    2028
Name: play_type, dtype: int64

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

X_train.shape

(3767, 7135)

In [12]:

brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brfc.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [13]:
y_pred = brfc.predict(X_test)

balanced_accuracy_score(y_test , y_pred)

1.0

In [14]:
confusion_matrix(y_test, y_pred)
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       1.00      1.00      1.00      1.00      1.00      1.00       749
        1.0       1.00      1.00      1.00      1.00      1.00      1.00       507

avg / total       1.00      1.00      1.00      1.00      1.00      1.00      1256



In [17]:
matrix_st = confusion_matrix(y_test, y_pred)

cm4_df = pd.DataFrame(matrix_st, index=["Actual Run", "Actual Pass"], columns=["Predicted Run", "Predicted Pass"])
cm4_df

Unnamed: 0,Predicted Run,Predicted Pass
Actual Run,749,0
Actual Pass,0,507


In [18]:
confusion_matrix(y_test , y_pred)

array([[749,   0],
       [  0, 507]])

#### Combination (Over and Under) Sampling

In [20]:
smote_enn = SMOTEENN(random_state=1)
X_resample4, y_resample4 = smote_enn.fit_resample(X, y)
Counter(y_resample4)

Counter({0.0: 724, 1.0: 827})

In [21]:
model = LogisticRegression(solver='lbfgs', random_state=1)

model.fit(X_resample4, y_resample4)

LogisticRegression(random_state=1)

In [22]:
y_pred_st = model.predict(X_test)

acc_score4 = balanced_accuracy_score(y_test, y_pred_st)
acc_score4

0.6326884234864105

In [23]:
matrix_st = confusion_matrix(y_test, y_pred_st)

cm4_df = pd.DataFrame(matrix_st, index=["Actual Run", "Actual Pass"], columns=["Predicted Run", "Predicted Pass"])
cm4_df

Unnamed: 0,Predicted Run,Predicted Pass
Actual Run,379,370
Actual Pass,122,385


In [24]:
print(classification_report_imbalanced(y_test, y_pred_st))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.76      0.51      0.76      0.61      0.62      0.37       749
        1.0       0.51      0.76      0.51      0.61      0.62      0.39       507

avg / total       0.66      0.61      0.66      0.61      0.62      0.38      1256

