In [2]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np 
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN

In [7]:
# loading the data in 

file_path = Path("Resources/simple_df.csv")

df = pd.read_csv(file_path)

df

Unnamed: 0.1,Unnamed: 0,posteam,play_type,down,ydstogo
0,0,TEN,run,1.0,10
1,1,TEN,pass,2.0,13
2,2,TEN,pass,3.0,10
3,3,ARI,pass,1.0,10
4,4,ARI,run,1.0,10
...,...,...,...,...,...
35761,35761,CIN,pass,1.0,10
35762,35762,CIN,pass,1.0,10
35763,35763,CIN,pass,2.0,1
35764,35764,CIN,run,3.0,1


In [5]:
len(df)

35766

In [8]:
df = df.dropna(axis = 'columns' , how='any')

df = df.dropna()


In [9]:
df['play_type'] = df['play_type'].astype(str).str.replace('run' , '1')
df['play_type'] = df['play_type'].astype(str).str.replace('pass' , '0')
df['play_type'] = df['play_type'].astype('float')
df


Unnamed: 0.1,Unnamed: 0,posteam,play_type,ydstogo
0,0,TEN,1.0,10
1,1,TEN,0.0,13
2,2,TEN,0.0,10
3,3,ARI,0.0,10
4,4,ARI,1.0,10
...,...,...,...,...
35761,35761,CIN,0.0,10
35762,35762,CIN,0.0,10
35763,35763,CIN,0.0,1
35764,35764,CIN,1.0,1


In [None]:
x = {'run': '1.0' , 'pass': '0.0'}
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()

In [11]:
X = df.drop('play_type' , axis=1)

X = pd.get_dummies(X)

# Create our target
y = df['play_type']

In [None]:
X.describe()

In [None]:
y.value_counts()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

X_train.shape

In [None]:

brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brfc.fit(X_train, y_train)

In [116]:
y_pred = brfc.predict(X_test)

balanced_accuracy_score(y_test , y_pred)

NameError: name 'brfc' is not defined

In [None]:
confusion_matrix(y_test, y_pred)
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

In [18]:
eec = EasyEnsembleClassifier(n_estimators=100 , random_state=1)
eec = eec.fit(X_train , y_train)

In [29]:
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test , y_pred)

0.6016918676390409

In [28]:
confusion_matrix(y_test , y_pred)

array([[2759, 2401],
       [1253, 2529]], dtype=int64)

In [27]:
print(classification_report_imbalanced(y_test , y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.69      0.53      0.67      0.60      0.60      0.35      5160
        1.0       0.51      0.67      0.53      0.58      0.60      0.36      3782

avg / total       0.61      0.59      0.61      0.59      0.60      0.36      8942



In [26]:
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0.0,0.0
1,0.0,1.0
2,1.0,1.0
3,0.0,0.0
4,0.0,1.0
5,0.0,1.0
6,0.0,0.0
7,0.0,0.0
8,0.0,0.0
9,0.0,0.0


# Resampling 

### Oversampling

#### Naive Random Oversampling

In [48]:
ros = RandomOverSampler(random_state=1)
X_resamp, y_resamp = ros.fit_resample(X_train, y_train)
Counter(y_resamp)

Counter({1.0: 15850, 0.0: 15850})

In [49]:
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resamp, y_resamp)

In [50]:
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5058568945515067

In [57]:
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0.0,0.0
1,0.0,1.0
2,1.0,1.0
3,0.0,0.0
4,1.0,1.0
5,1.0,1.0
6,1.0,0.0
7,1.0,0.0
8,1.0,0.0
9,0.0,0.0


In [58]:
matrix = confusion_matrix(y_test, y_pred)

array([[3315, 1845],
       [2190, 1592]], dtype=int64)

In [65]:
matrix = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    matrix, index=["Actual Run", "Actual Pass"], columns=["Predicted Run", "Predicted Pass"])
cm_df

## Need to figure out a better display output. Confusing as it reads right now. Im not sure which is Predicted and Actual. 

Unnamed: 0,Predicted Run,Predicted Pass
Actual Run,3315,1845
Actual Pass,2190,1592


In [67]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.60      0.64      0.42      0.62      0.52      0.28      5160
        1.0       0.46      0.42      0.64      0.44      0.52      0.26      3782

avg / total       0.54      0.55      0.51      0.55      0.52      0.27      8942



#### SMOTE Oversampling

In [61]:
X_resamp, y_resamp = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(X_train, y_train)
Counter(y_resamp)

Counter({1.0: 15850, 0.0: 15850})

In [55]:
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resamp, y_resamp)

In [69]:
y_pred_s = y_pred = model.predict(X_test)
acc_score_2 = balanced_accuracy_score(y_test, y_pred_s)
acc_score_2

0.5316915806820558

In [70]:
matrix_sm = confusion_matrix(y_test, y_pred_s)

cm2_df = pd.DataFrame(
    matrix_sm, index=["Actual Run", "Actual Pass"], columns=["Predicted Run", "Predicted Pass"])
cm2_df

Unnamed: 0,Predicted Run,Predicted Pass
Actual Run,3315,1845
Actual Pass,2190,1592


### Undersampling 

In [72]:
cc = ClusterCentroids(random_state=1)
X_resample3, y_resample3 = cc.fit_resample(X_train, y_train)
Counter(y_resample3)

Counter({0.0: 10974, 1.0: 10974})

In [73]:
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resample3, y_resample3)

In [74]:
y_pred_cc = model.predict(X_test)

acc_score3 = balanced_accuracy_score(y_test, y_pred_cc)
acc_score3

0.5305310958887263

In [75]:
matrix_cc = confusion_matrix(y_test, y_pred_cc)

cm3_df = pd.DataFrame(
    matrix_cc, index=["Actual Run", "Actual Pass"], columns=["Predicted Run", "Predicted Pass"])
cm3_df

Unnamed: 0,Predicted Run,Predicted Pass
Actual Run,3258,1902
Actual Pass,2157,1625


In [76]:
print(classification_report_imbalanced(y_test, y_pred_cc))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.60      0.63      0.43      0.62      0.52      0.28      5160
        1.0       0.46      0.43      0.63      0.44      0.52      0.27      3782

avg / total       0.54      0.55      0.51      0.54      0.52      0.27      8942



#### Combination (Over and Under) Sampling

In [78]:
smote_enn = SMOTEENN(random_state=1)
X_resample4, y_resample4 = smote_enn.fit_resample(X, y)
Counter(y_resample4)

Counter({0.0: 4599, 1.0: 5621})

In [79]:
model = LogisticRegression(solver='lbfgs', random_state=1)

model.fit(X_resample4, y_resample4)

In [80]:
y_pred_st = model.predict(X_test)

acc_score4 = balanced_accuracy_score(y_test, y_pred_st)
acc_score4

0.5347264070115889

In [81]:
matrix_st = confusion_matrix(y_test, y_pred_st)

cm4_df = pd.DataFrame(matrix_st, index=["Actual Run", "Actual Pass"], columns=["Predicted Run", "Predicted Pass"])
cm4_df

Unnamed: 0,Predicted Run,Predicted Pass
Actual Run,2570,2590
Actual Pass,1621,2161


In [82]:
print(classification_report_imbalanced(y_test, y_pred_st))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.61      0.50      0.57      0.55      0.53      0.28      5160
        1.0       0.45      0.57      0.50      0.51      0.53      0.29      3782

avg / total       0.55      0.53      0.54      0.53      0.53      0.28      8942

