In [132]:
import pandas as pd
import numpy as np
from pathlib import Path
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestClassifier

In [98]:
# Load the data
file_path = Path('team_cautious_waffle-JTbranch/nfl_2020_player_stats.csv')
nfl_ml_df = pd.read_csv(file_path)
nfl_ml_df.set_index("name", inplace=True)

In [133]:
from config import db_password

In [135]:
# Create the connection to the PostgreSQL database
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/nfl_2020_player_stats"
engine = create_engine(db_string)
# movies_df.to_sql(name='movies', con=engine, if_exists='replace')
nfl_ml_df = pd.read_sql_table('nfl2020', con=engine, index_col='name')

In [136]:
nfl_ml_df['quarterback_rating'] = nfl_ml_df['quarterback_rating'].astype('int64')
nfl_ml_df['fantasy_points'] = nfl_ml_df['fantasy_points'].astype('int64')

In [137]:
nfl_ml_df = pd.get_dummies(nfl_ml_df)

In [138]:
nfl_ml_df['position_QB'] = nfl_ml_df['position_QB'].astype('int64')
nfl_ml_df['position_RB'] = nfl_ml_df['position_RB'].astype('int64')
nfl_ml_df['position_TE'] = nfl_ml_df['position_TE'].astype('int64')
nfl_ml_df['position_WR'] = nfl_ml_df['position_WR'].astype('int64')
nfl_ml_df.dtypes

completed_passes                   int64
attempted_passes                   int64
passing_yards                      int64
passing_touchdowns                 int64
interceptions_thrown               int64
times_sacked                       int64
yards_lost_from_sacks              int64
longest_pass                       int64
quarterback_rating                 int64
rush_attempts                      int64
rush_yards                         int64
rush_touchdowns                    int64
longest_rush                       int64
times_pass_target                  int64
receptions                         int64
receiving_yards                    int64
receiving_touchdowns               int64
longest_reception                  int64
fumbles                            int64
fumbles_lost                       int64
fumbles_recovered_for_touchdown    int64
kickoff_return_touchdown           int64
punt_return_touchdown              int64
fantasy_points                     int64
position_QB     

## Split data into features and target

In [107]:
# Create our features
X = nfl_ml_df.copy() 
X = X.drop(['fantasy_points'], axis=1)

# Create our target
y = nfl_ml_df['fantasy_points']

X.head()

Unnamed: 0_level_0,completed_passes,attempted_passes,passing_yards,passing_touchdowns,interceptions_thrown,times_sacked,yards_lost_from_sacks,longest_pass,quarterback_rating,rush_attempts,...,longest_reception,fumbles,fumbles_lost,fumbles_recovered_for_touchdown,kickoff_return_touchdown,punt_return_touchdown,position_QB,position_RB,position_TE,position_WR
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Kirk Cousins,19,25,259,2,1,2,11,37,118,4,...,0,0,0,0,0,0,1,0,0,0
Alexander Mattison,0,0,0,0,0,0,0,0,0,6,...,9,0,0,0,0,0,0,1,0,0
Dalvin Cook,0,0,0,0,0,0,0,0,0,12,...,-2,0,0,0,0,0,0,1,0,0
Adam Thielen,0,0,0,0,0,0,0,0,0,0,...,37,0,0,0,0,0,0,0,0,1
Olabisi Johnson,0,0,0,0,0,0,0,0,0,0,...,29,0,0,0,0,0,0,0,0,1


In [108]:
# Check the balance of our target values
y.value_counts()

 0      378
 4      259
 3      258
 2      247
 6      234
       ... 
 49       1
-23       1
-19       1
 44       1
-113      1
Name: fantasy_points, Length: 69, dtype: int64

In [109]:
# Split into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(3268, 27)
(1090, 27)
(3268,)
(1090,)


## Random Forest Classifier to determine feature importance

In [119]:
# Creating a StandardScaler instance
scaler = StandardScaler()

# Fitting the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [120]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [121]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [122]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
predictions

array([ 1,  3, 17, ...,  3,  5,  8], dtype=int64)

In [123]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.01564198, 0.01651364, 0.02138519, 0.00973304, 0.0045933 ,
       0.00755634, 0.01072716, 0.01394075, 0.01618545, 0.06101261,
       0.10448542, 0.02284064, 0.07065686, 0.07105025, 0.11935875,
       0.20179678, 0.04319742, 0.12732687, 0.01578277, 0.01167218,
       0.00301233, 0.00063492, 0.00254834, 0.00112849, 0.0083551 ,
       0.00714092, 0.0117225 ])

In [124]:
# Features sorted by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.20179677913986063, 'receiving_yards'),
 (0.12732687029596101, 'longest_reception'),
 (0.11935874821286535, 'receptions'),
 (0.10448542421076477, 'rush_yards'),
 (0.07105024537927623, 'times_pass_target'),
 (0.07065685827507188, 'longest_rush'),
 (0.06101261012596906, 'rush_attempts'),
 (0.04319742426699275, 'receiving_touchdowns'),
 (0.02284064400707, 'rush_touchdowns'),
 (0.021385185838941303, 'passing_yards'),
 (0.0165136410164568, 'attempted_passes'),
 (0.016185450495152763, 'quarterback_rating'),
 (0.01578276729320897, 'fumbles'),
 (0.01564198098013822, 'completed_passes'),
 (0.013940747728846832, 'longest_pass'),
 (0.01172250479082299, 'position_WR'),
 (0.011672178146155485, 'fumbles_lost'),
 (0.010727158061818563, 'yards_lost_from_sacks'),
 (0.009733040334720491, 'passing_touchdowns'),
 (0.008355103796766449, 'position_RB'),
 (0.007556336118190117, 'times_sacked'),
 (0.007140918919521495, 'position_TE'),
 (0.00459330051371931, 'interceptions_thrown'),
 (0.0030123304745439494,

## Linear Regression Model

In [125]:
model = LinearRegression()
model.fit(X, y)

LinearRegression()

In [126]:
y_pred = model.predict(X_test)

In [127]:
explained_variance_score(y_test, y_pred)

0.9973049035813789

In [128]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.9973041962336178