In [2]:
# Initial Imports

%matplotlib inline
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from sklearn.datasets import make_blobs
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the source CSV into a DataFrame.
# The path is relative, so it is resolved against the notebook's working directory.
file_path = Path("nfl_data_2020.csv")
nfl_ml_df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Load the data from SQL
# NOTE: the connection URL and the table name below are placeholders and must
# be replaced with real values before this cell can run.
import sqlalchemy as db
engine = db.create_engine('INSERT DATABASE LINK HERE')
connection = engine.connect()
metadata = db.MetaData()
# Read the table contents into a DataFrame. Binding nfl_ml_df to a
# sqlalchemy Table object would only give schema metadata, and the later cells
# call DataFrame methods (.copy(), .drop(), column indexing) on nfl_ml_df.
nfl_ml_df = pd.read_sql_table('TABLE NAME HERE', con=connection)

In [None]:
# Column names of interest in the dataset.
# Each name must be its own list element: without the separating commas Python
# joins adjacent string literals into one long string, leaving a list with a
# single (meaningless) entry.
columns = [
    "passing_yards",
    "passing_touchdowns",
    "interceptions_thrown",
    "rush_yards",
    "rush_touchdowns",
    "receptions",
    "receiving_yards",
    "receiving_touchdowns",
    "fumbles_lost",
    "kickoff_return_touchdown",
    "punt_return_touchdown",
    "position",
    "fantasy_points",
]

# Name of the column used as the prediction target
target = ["fantasy_points"]

In [None]:
# Preview the loaded data. The frame is named nfl_ml_df in the load cells;
# show only the first rows instead of dumping the whole table.
nfl_ml_df.head()

## Split data into train and test

In [None]:
# Create our target: the fantasy_points column
# NOTE(review): the later cells treat y as class labels; confirm that
# fantasy_points is categorical/binned rather than a continuous score.
y = nfl_ml_df['fantasy_points']

# Create our features: every column except the target.
# drop() returns a new frame, so nfl_ml_df itself is left unchanged.
X = nfl_ml_df.drop(columns='fantasy_points')

In [None]:
# Summary statistics of the feature columns
X.describe()

In [None]:
# Check the balance of our target values
y.value_counts()

In [None]:
# Split features and target into train and test sets. No test_size is passed,
# so scikit-learn's default split proportion applies; the fixed random_state
# keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

## Naive Random Oversampling

In [None]:
# Resample the training data with the RandomOversampler
# Only the training split is resampled; X_test / y_test are left untouched.
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Show how many samples each target value has after resampling
Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculate the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

## SMOTE Oversampling

In [None]:
# Resample the training data with SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train, y_train
)
Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

## Undersampling

In [None]:
# Resample the data using the ClusterCentroids resampler
# Only the training split is resampled; the test split is left as-is.
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
# Show how many samples each target value has after resampling
Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the resampled data
# NOTE(review): random_state is 78 here but 1 for the LogisticRegression models
# in the other sections; confirm the difference is intentional.
model = LogisticRegression(solver='lbfgs', random_state=78)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculate the balanced accuracy score
# Predictions are made on the original (not resampled) test split.
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

## Combination Sampling

In [None]:
# Resample the training data with SMOTEENN
# Fit the resampler on the training split only: resampling the full X, y would
# let test rows shape the data the model is trained on and inflate the scores
# computed on X_test afterwards (data leakage).
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
# Show how many samples each target value has after resampling
Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

## Balanced Random Forest Classifier

In [None]:
# Creating a StandardScaler instance.
scaler = StandardScaler()

# Fitting the Standard Scaler with the training data only, so the scaling
# parameters are not influenced by the test split.
X_scaler = scaler.fit(X_train)

# Scaling both splits with the parameters learned from the training data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Resample the training data with the BalancedRandomForestClassifier
rfc_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

In [None]:
# Fitting the model
rfc_model = rfc_model.fit(X_train_scaled, y_train)

In [None]:
y_pred = rfc_model.predict(X_test_scaled)

In [None]:
# Calculate the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# Calculate feature importance in the Random Forest model
feature_importance = rfc_model.feature_importances_
feature_importance

In [None]:
# List the features sorted in descending order by feature importance
sorted(zip(rfc_model.feature_importances_, X.columns), reverse=True)

## Easy Ensemble with AdaBoost

In [None]:
# Train the EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train) 

In [None]:
# Calculated the balanced accuracy score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))