In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np 
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN

In [None]:
# Load the cleaned NFL play data; the path is relative to the notebook's working directory.
fp = Path('Resources/clean_nfl_data.csv')

# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame() wrap is needed.
df_2 = pd.read_csv(fp)

# Preview the first rows only instead of rendering the whole frame.
df_2.head()

In [None]:
# Encode the target numerically: run -> 1, pass -> 0.
# Whole-value replacement (Series.replace) is used instead of substring
# replacement (Series.str.replace) so a label that merely contains "run" or
# "pass" is never partially rewritten; any unexpected label still fails loudly
# in astype('float'). Missing values stay NaN.
df_2['play_type'] = (
    df_2['play_type']
    .replace({'run': '1', 'pass': '0'})
    .astype('float')
)
df_2.head()

In [None]:

# List every column name so the relevant features can be picked below.
print(list(df_2.columns))


In [None]:
# Keep only the team, situation, and target columns, then let pandas pick
# the best nullable dtype for each of them.
df_2 = df_2[
    ['home_team', 'away_team', 'posteam', 'down', 'ydstogo', 'play_type']
].convert_dtypes()


In [None]:
# Keep only the rows whose `down` value is not 0.
# NOTE(review): how rows with a missing `down` are treated by this mask
# depends on the column dtype / pandas version — confirm.
df_2 = df_2[df_2['down'] != 0]

In [None]:
# Count how many `down` values are still missing.
df_2['down'].isna().sum()

In [None]:

# Narrow the frame to the two features and the target used for modelling.
df_2 = df_2[
    ['down', 'play_type', 'ydstogo']
].convert_dtypes()
df_2

In [None]:
# Remove incomplete observations.
# Only ROWS with a missing value are dropped. The previous
# dropna(axis='columns', how='any') would delete an entire feature (or the
# `play_type` target itself) as soon as a single value in it was missing,
# which breaks the feature/target split that follows.
df_2 = df_2.dropna(axis='index', how='any')

df_2


In [None]:
# Map textual down labels to their digit strings wherever they occur as cell values.
# NOTE(review): if the remaining columns are already numeric at this point,
# no cell equals these strings and the replacement changes nothing — confirm.
x_2 = {
    'down_1': '1',
    'down_2': '2',
    'down_3': '3',
    'down_4': '4',
}
df_2 = df_2.replace(to_replace=x_2)


In [None]:

# Renumber the rows 0..n-1 after filtering, discarding the old index.
df_2 = df_2.reset_index(drop=True)

df_2.head(n=10)


In [None]:

# Target: the encoded play type as plain integers.
y_2 = df_2['play_type'].astype('int')

# Features: every column except the target, with any non-numeric columns
# one-hot encoded by get_dummies.
X_2 = pd.get_dummies(df_2.drop(columns='play_type'))

In [None]:
# Summary statistics for the feature columns in X_2.
X_2.describe()

In [None]:
# Number of observations per target class, to check the class balance.
y_2.value_counts()


In [None]:
# Hold out a test set; the fixed seed makes the split reproducible.
X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(
    X_2,
    y_2,
    random_state=1,
)
X_2_train.shape

In [None]:
# Balanced random forest: build a 100-tree ensemble, fit it on the training
# split, then predict the held-out plays.
brfc = BalancedRandomForestClassifier(random_state=1, n_estimators=100)
brfc.fit(X_2_train, y_2_train)
y_2_pred = brfc.predict(X_2_test)

In [None]:
# Predict the test split with the balanced random forest and report the
# balanced accuracy of those predictions.
y_2_pred = brfc.predict(X_2_test)

balanced_accuracy_score(y_true=y_2_test, y_pred=y_2_pred)

In [None]:
# Evaluate the current predictions in `y_2_pred` against the test labels.
# Only a cell's last bare expression is rendered, so the score and the report
# are printed explicitly and the confusion matrix is left as the final
# expression; previously the score and the matrix were computed and discarded.
print("Balanced accuracy:", balanced_accuracy_score(y_2_test, y_2_pred))
print(classification_report_imbalanced(y_2_test, y_2_pred))
confusion_matrix(y_2_test, y_2_pred)


In [None]:
# Side-by-side table of predicted vs. actual labels for the test split.
results_2 = pd.DataFrame(
    {
        "Prediction": y_2_pred,
        "Actual": y_2_test,
    }
).reset_index(drop=True)
results_2.head(n=20)

In [None]:
# Easy ensemble classifier: 100 estimators, fitted on the training split and
# used to predict the held-out plays (fit() hands back the fitted estimator).
eec = EasyEnsembleClassifier(random_state=1, n_estimators=100).fit(X_2_train, y_2_train)
y_2_pred = eec.predict(X_2_test)


In [None]:
# Balanced accuracy of the current predictions in `y_2_pred`.
balanced_accuracy_score(y_true=y_2_test, y_pred=y_2_pred)


In [None]:
# Evaluate the current predictions in `y_2_pred` against the test labels.
# The report is printed and the confusion matrix is left as the cell's final
# expression so that it is actually rendered; previously it was computed
# and silently discarded.
print(classification_report_imbalanced(y_2_test, y_2_pred))
confusion_matrix(y_2_test, y_2_pred)


In [None]:
# Side-by-side table of predicted vs. actual labels for the test split.
results = pd.DataFrame(
    {
        "Prediction": y_2_pred,
        "Actual": y_2_test,
    }
).reset_index(drop=True)
results.head(n=20)

In [None]:
# Random oversampling of the training split only (the test split is left
# untouched), followed by the resulting per-class counts.
ros = RandomOverSampler(random_state=1)
X_2_resamp, y_2_resamp = ros.fit_resample(
    X_2_train,
    y_2_train,
)
Counter(y_2_resamp)

In [None]:
# Logistic regression trained on the randomly oversampled training data.
model = LogisticRegression(random_state=1, solver='lbfgs')
model.fit(X=X_2_resamp, y=y_2_resamp)

In [None]:
# Predict the held-out plays with the current `model` and score them.
y_2_pred = model.predict(X=X_2_test)
balanced_accuracy_score(y_true=y_2_test, y_pred=y_2_pred)

In [None]:
# Side-by-side table of predicted vs. actual labels for the test split.
results = pd.DataFrame(
    {
        "Prediction": y_2_pred,
        "Actual": y_2_test,
    }
).reset_index(drop=True)
results.head(n=20)

In [None]:
# Confusion matrix of the current predictions in `y_2_pred`.
matrix = confusion_matrix(y_true=y_2_test, y_pred=y_2_pred)


In [None]:
# Labelled confusion matrix for the current predictions in `y_2_pred`.
# scikit-learn puts the ACTUAL classes on the rows and the PREDICTED classes
# on the columns, both in sorted label order. With the encoding used in this
# notebook (pass -> 0, run -> 1) that order is Pass, then Run. The previous
# column labels ('Actual', 'Predicted') described the axes, not the classes.
matrix = confusion_matrix(y_2_test, y_2_pred)
cm_df = pd.DataFrame(
    matrix,
    index=['Actual Pass', 'Actual Run'],
    columns=['Predicted Pass', 'Predicted Run'],
)
cm_df

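## TODO: Find a clearer way to display this output. As it reads right now it is confusing: I'm not sure which axis is Predicted and which is Actual. (For reference, scikit-learn's `confusion_matrix` puts actual classes on the rows and predicted classes on the columns.)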

In [None]:
# Per-class report for the current predictions in `y_2_pred`.
print(
    classification_report_imbalanced(y_2_test, y_2_pred)
)

#### SMOTE Oversampling

In [None]:
# SMOTE oversampling: synthesise new minority-class samples from the training
# split only, then train logistic regression on the result.
# Previously this cell refit on X_2_resamp / y_2_resamp, i.e. the
# RandomOverSampler output, so SMOTE was never actually applied. Separate
# variable names keep the random-oversampled data intact.
X_2_resamp_sm, y_2_resamp_sm = SMOTE(random_state=1).fit_resample(X_2_train, y_2_train)

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_2_resamp_sm, y_2_resamp_sm)

In [None]:
# Predict the held-out plays with the current `model`; the same prediction
# array is bound to both `y_2_pred_s` and `y_2_pred`.
y_2_pred_s = model.predict(X_2_test)
y_2_pred = y_2_pred_s

# Balanced accuracy of those predictions.
acc_score_2 = balanced_accuracy_score(y_true=y_2_test, y_pred=y_2_pred_s)
acc_score_2

In [None]:
# Labelled confusion matrix for the predictions in `y_2_pred_s`.
# Rows are the ACTUAL classes and columns the PREDICTED classes, both in
# sorted label order; with pass -> 0 and run -> 1 that order is Pass, then Run.
# The previous column labels ('Actual', 'Predicted') named the axes rather
# than the classes, which made the table misleading.
matrix_sm = confusion_matrix(y_2_test, y_2_pred_s)

cm2_df = pd.DataFrame(
    matrix_sm,
    index=['Actual Pass', 'Actual Run'],
    columns=['Predicted Pass', 'Predicted Run'],
)

cm2_df


#### Undersampling

In [None]:
# Cluster-centroid undersampling of the training split only, followed by the
# resulting per-class counts.
cc = ClusterCentroids(random_state=1)
X_2_resample3, y_2_resample3 = cc.fit_resample(
    X_2_train,
    y_2_train,
)
Counter(y_2_resample3)


In [None]:
# Logistic regression trained on the undersampled training data, then used to
# predict the held-out plays.
model = LogisticRegression(random_state=1, solver='lbfgs')
model.fit(X=X_2_resample3, y=y_2_resample3)
y_2_pred_cc = model.predict(X=X_2_test)


In [None]:

# Balanced accuracy of the predictions in `y_2_pred_cc`.
acc_score3 = balanced_accuracy_score(y_true=y_2_test, y_pred=y_2_pred_cc)
acc_score3


In [None]:
# Confusion matrix of the predictions in `y_2_pred_cc`.
matrix_cc = confusion_matrix(y_true=y_2_test, y_pred=y_2_pred_cc)


In [None]:

# Labelled confusion matrix and per-class report for `y_2_pred_cc`.
# confusion_matrix orders classes by sorted label, and this notebook encodes
# pass -> 0 and run -> 1, so the first row/column is Pass, not Run; the
# previous labels were in Run/Pass order and therefore swapped.
cm3_df = pd.DataFrame(
    matrix_cc,
    index=["Actual Pass", "Actual Run"],
    columns=["Predicted Pass", "Predicted Run"],
)
print(classification_report_imbalanced(y_2_test, y_2_pred_cc))
# Kept as the final expression so the table is actually rendered; a bare
# expression followed by another statement is never displayed.
cm3_df
#### Combination (Over and Under) Sampling