In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.utils._testing import ignore_warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression



import torch
import torch.nn as nn
import torch.optim as optim

from skorch import NeuralNetBinaryClassifier

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from  clf_utils import grid_cv_model



In [2]:
#Load Dataset and Coaching Data
data = pd.read_csv('../Data/Final Dataset.csv')

coaching_data = pd.read_csv('../Data/Coaching Data.csv', skiprows = [0,1], skipfooter = 202)
coaching_data = coaching_data.rename(columns = {'FBS Team': 'Team'})
coaching_data = coaching_data[['Team','2019','2020','2021','2022','2023','2024']]
coaching_data = coaching_data.melt(id_vars='Team', var_name = 'Season', value_name = 'Coach')
coaching_data['Season'] = coaching_data['Season'].astype('int64')

  data = pd.read_csv('Data/Final Dataset.csv')
  coaching_data = pd.read_csv('Data/Coaching Data.csv', skiprows = [0,1], skipfooter = 202)


In [3]:
#Define Coaching Change Function to Add Coaching Change Column to Data
def coach_change(row, data):

    team = str(row.Team)
    season = int(row.Season)

    try:
    
        curr_coach = data[(data['Team'] == team) & (data['Season'] == season)]['Coach']
        curr_coach = curr_coach[curr_coach.index[0]]
        next_coach = data[(data['Team'] == team) & (data['Season'] == season + 1)]['Coach']
        next_coach = next_coach[next_coach.index[0]]

        if curr_coach != next_coach:
            return 'Yes'
        return 'No'
        
    
    except:
        return 'N/A'

In [4]:
#Add coaching change info to data
data['Coach Change'] = data.apply(lambda x: coach_change(x, coaching_data), axis = 1)

In [5]:
positions = data.Position.unique()
positions

array(['RB', 'WR', 'QB', 'TE', 'LB', 'DB', 'OL', 'DL', 'CB', 'S', 'PK',
       'LS', 'P', 'DT', 'DE', 'FB', 'C', 'OT', 'G', 'NT', 'ATH', 'OLB',
       '?'], dtype=object)

In [6]:
#Add recruiting info
for i,year in enumerate(['2015','2016','2017','2018', '2019','2020', '2021', '2022', '2023']):
    file = pd.read_csv('../Data/Player Recruit Ranking/' + year + '.csv')
    file.rename(columns = {'AthleteId': 'PlayerId', 'Year': 'Class of'}, inplace = True)
    #file.drop(columns = ['Year', 'Rating', 'Ranking'], inplace = True)
    #file.drop(columns = ['Year'], inplace = True)
    data = data.merge(file, on = 'PlayerId', how = 'left', suffixes = [None, '_' + str(i)])

In [7]:
#Combine Columns
for i in range(0,9):
    data['Stars'] = data['Stars'].combine_first(data['Stars_' + str(i)])
    data.drop(columns = ['Stars_' + str(i)], inplace = True)
for i in range(1,9):
    data['Rating'] = data['Rating'].combine_first(data['Rating_' + str(i)])
    data['Ranking'] = data['Ranking'].combine_first(data['Ranking_' + str(i)])
    data['Class of'] = data['Class of'].combine_first(data['Class of_' + str(i)])
    data.drop(columns = ['Rating_' + str(i)], inplace = True)
    data.drop(columns = ['Ranking_' + str(i)], inplace = True)
    data.drop(columns = ['Class of_' + str(i)], inplace = True)

data['Yr'] = data['Season'] - data['Class of'] + 1
data.drop(columns = ['Class of'], inplace = True)

In [8]:
position_groups = {'OL':['OL', 'NT', 'OT', 'G', 'C','FB'],
                   'TE':['TE'],
                   'QB':['QB'],
                   'RB':['RB'],
                   'WR':['WR'],
                   'DL':['DT', 'DE', 'DL'],
                   'DB':['DB', 'CB', 'S'],
                   'LB':['LB'],
                   'ST':['LS', 'P', 'PK']}

In [9]:
#Separate Data into Position Groups
data_sets = {}
for key in position_groups.keys():
    data_sets[key] = data.copy()[data.copy()['Position'].isin(position_groups[key])]

Testing with QB Data


In [39]:
qb = data_sets['QB'].copy().drop(columns = ['PositionId', 'ConferenceId', 'TeamId'])
qb['Stars'] = qb['Stars'].fillna(0)
qb['Yr'] = qb['Yr'].fillna(0)

In [40]:
#Engineer PCT Features
qb['Pct_Team_Pass_Yds'] = qb['YDS']/qb['Team netPassingYards']
qb['Pct_Team_Pass_Attempts'] = qb['ATT']/qb['Team passAttempts']
qb['Pct_Team_Pass_TDs'] = qb['TD']/qb['Team passingTDs']
qb['Pct_Team_Pass_Completions'] = qb['COMPLETIONS']/qb['Team passCompletions']
qb['Pct_Team_Ints'] = qb['INT']/qb['Team passesIntercepted']

In [41]:
#Define Improve Function
def improve(row, column, data):
    try: 
        id = int(row['PlayerId'])
        season = int(row['Season'])
        column = column
        imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])

        if imp:
            return 1 
        return -1
    except:
        return 0
 

In [42]:
#Engineer Improve Features
feats_to_improve = ['Usage Overall', 'Usage Pass', 'Usage Rush', 'Usage FirstDown','Usage SecondDown', 'Usage ThirdDown', 'Usage StandardDowns',
       'Usage PassingDowns','ATT', 'AVG','COMPLETIONS','INT','LONG','PCT','TD','YDS', 'YPA','Pct_Team_Pass_Yds', 'Pct_Team_Pass_Attempts', 'Pct_Team_Pass_TDs',
       'Pct_Team_Pass_Completions', 'Pct_Team_Ints']

for feat in feats_to_improve:
    qb[feat+'_improve'] = qb.apply(lambda x: improve(x, feat, qb), axis = 1)

  imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])
  imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])
  imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])
  imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])
  imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])
  imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])
  imp = float(data[(data['Pl

In [43]:
#Function to compare player stats with players of same year and star ranking
def compare(row, column, data):
    star = int(row['Stars'])
    year = int(row['Yr'])
    stat = column
    pos = str(row['Position'])
    id = int(row['PlayerId'])
    season = int(row['Season'])

    mean = data[(data['Stars'] == star) & (data['Yr'] == year) & (data['Position'] == pos)][stat].mean()
    p_stat = data[(data['PlayerId'] == id)&(data['Season'] == season)][stat]
    p_stat = p_stat[p_stat.index[0]]

    if int(p_stat>mean):
        return 1
    elif int(p_stat<mean):
        return -1
    else:
        return 0

In [44]:
feats_to_compare = ['Usage Overall', 'Usage Pass', 'Usage Rush', 'Usage FirstDown','Usage SecondDown', 'Usage ThirdDown', 'Usage StandardDowns',
       'Usage PassingDowns','ATT', 'AVG','COMPLETIONS','INT','LONG','PCT','TD','YDS', 'YPA','Pct_Team_Pass_Yds', 'Pct_Team_Pass_Attempts', 'Pct_Team_Pass_TDs',
       'Pct_Team_Pass_Completions', 'Pct_Team_Ints']
for feat in feats_to_compare:
    qb[feat + '_compare'] = qb.apply(lambda x: compare(x, feat, qb), axis = 1)

In [45]:
#Specify Relevant Columns
#No team stats added as of right now
rel_feats = ['Usage Overall', 'Usage Pass', 'Usage Rush', 'Usage FirstDown','Usage SecondDown', 'Usage ThirdDown', 'Usage StandardDowns',
             'Usage PassingDowns','ATT','COMPLETIONS','INT','LONG','PCT','TD','YDS', 'YPA','Pct_Team_Pass_Yds', 'Pct_Team_Pass_Attempts', 'Pct_Team_Pass_TDs',
             'Pct_Team_Pass_Completions', 'Pct_Team_Ints', 'Usage Overall_improve',
             'Usage Pass_improve', 'Usage Rush_improve', 'Usage FirstDown_improve',
             'Usage SecondDown_improve', 'Usage ThirdDown_improve',
             'Usage StandardDowns_improve', 'Usage PassingDowns_improve',
             'ATT_improve', 'AVG_improve', 'COMPLETIONS_improve', 'INT_improve',
             'LONG_improve', 'PCT_improve', 'TD_improve', 'YDS_improve',
             'YPA_improve', 'Pct_Team_Pass_Yds_improve',
             'Pct_Team_Pass_Attempts_improve', 'Pct_Team_Pass_TDs_improve',
             'Pct_Team_Pass_Completions_improve', 'Pct_Team_Ints_improve', 'Usage Overall_compare', 'Usage Pass_compare', 'Usage Rush_compare',
             'Usage FirstDown_compare', 'Usage SecondDown_compare',
             'Usage ThirdDown_compare', 'Usage StandardDowns_compare',
             'Usage PassingDowns_compare', 'ATT_compare', 'AVG_compare',
             'COMPLETIONS_compare', 'INT_compare', 'LONG_compare', 'PCT_compare',
             'TD_compare', 'YDS_compare', 'YPA_compare', 'Pct_Team_Pass_Yds_compare',
             'Pct_Team_Pass_Attempts_compare', 'Pct_Team_Pass_TDs_compare',
             'Pct_Team_Pass_Completions_compare', 'Pct_Team_Ints_compare','Position','Yr','Stars', 'Coach Change', 'Ranking', 'Rating', 'Transfer_Portal']


#Get rid of 2019, narrow down to relevant features
qb = qb[qb['Season'] != 2019]
qb = qb[rel_feats]
#Encode Label Columns
qb['Transfer_Portal'] = np.where(qb['Transfer_Portal'].values == 'Yes', 1, 0)

#Convert Yr and Stars to Categorical Variables
qb['Yr'] = qb['Yr'].astype('str')
qb['Stars'] = qb['Stars'].astype('str')




In [46]:
#Split into Training and Testing Data
X = qb.drop(columns=['Transfer_Portal'])
y = qb['Transfer_Portal']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.1,random_state=42, stratify=y)

#Specify the Numerical Features and Categorical Features
categorical = ['Position','Yr','Stars', 'Coach Change']

numerical = [feat for feat in X.columns if feat not in categorical]

#Pipeline to SimpleImpute and OneHot Encode Categorical Features (Training data only)
impute_encode = Pipeline([('impute',SimpleImputer(strategy='constant',fill_value='N/A')), ('encode',OneHotEncoder(handle_unknown='infrequent_if_exist'))])
column_transform = ColumnTransformer([('cat_encode', impute_encode, categorical), ('numerical_pass', SimpleImputer(strategy='constant',fill_value=0),numerical)])

X_train = column_transform.fit_transform(X_train)

#Fit Pipeline ColumnTransformer to testing features
X_test = column_transform.transform(X_test)

#SMOTE Balancing of Training Data
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train,y_train)



In [47]:
#CV Models w/ Training Data
cv = KFold(n_splits = 5)

def classifiers():
    gb_clf = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=42))
    SVM_clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    forest_clf = RandomForestClassifier(random_state=42)

    gb_f1 = cross_val_score(gb_clf, X_train, y_train, cv=cv, scoring='f1')
    SVM_f1 = cross_val_score(SVM_clf, X_train, y_train, cv=cv, scoring='f1')
    forest_f1 = cross_val_score(forest_clf, X_train, y_train, cv=cv, scoring='f1')

    gb_pre = cross_val_score(gb_clf, X_train, y_train, cv=cv, scoring='precision')
    SVM_pre = cross_val_score(SVM_clf, X_train, y_train, cv=cv, scoring='precision')
    forest_pre = cross_val_score(forest_clf, X_train, y_train, cv=cv, scoring='precision')

    gb_re = cross_val_score(gb_clf, X_train, y_train, cv=cv, scoring='recall')
    SVM_re = cross_val_score(SVM_clf, X_train, y_train, cv=cv, scoring='recall')
    forest_re = cross_val_score(forest_clf, X_train, y_train, cv=cv, scoring='recall')

    print('F1 Scores:')
    print(gb_f1.mean())
    print(SVM_f1.mean())
    print(forest_f1.mean())
    print(' ')
    print('Precision Scores:')
    print(gb_f1.mean())
    print(SVM_pre.mean())
    print(forest_pre.mean())
    print(' ')
    print('Recall Scores:')
    print(gb_re.mean())
    print(SVM_re.mean())
    print(forest_re.mean())

classifiers()





F1 Scores:
0.7593032877753639
0.7628520865165949
0.7723269712743398
 
Precision Scores:
0.7593032877753639
0.7206457886457887
0.8265814461876637
 
Recall Scores:
0.7371715168363097
0.8428450452327795
0.7361366149887059


Comparing Base Models


In [48]:
#Dummy Classifier:

dummy = make_pipeline(StandardScaler(), DummyClassifier(random_state=42, strategy='stratified'))

dummy.fit(X_train, y_train)
preds = dummy.predict(X_test)


print('DummmyClassifier Metrics')
print('-----------------------------------')
print('F1 Score: ', f1_score(y_test, preds))
print('Precision Score: ', precision_score(y_test, preds))
print('Recall Score: ', recall_score(y_test, preds))
print('Accuracy: ', accuracy_score(y_test, preds))

DummmyClassifier Metrics
-----------------------------------
F1 Score:  0.3278688524590164
Precision Score:  0.23255813953488372
Recall Score:  0.5555555555555556
Accuracy:  0.5


In [49]:
#Evaluate trained model on test data

clf = make_pipeline(StandardScaler(),GradientBoostingClassifier(random_state=42))

clf.fit(X_train, y_train)
preds = clf.predict(X_test)

print('Base GradientBoostingClassifier Metrics')
print('---------------------------------------')
print('F1 Score: ', f1_score(y_test, preds))
print('Precision Score: ', precision_score(y_test, preds))
print('Recall Score: ', recall_score(y_test, preds))
print('Accuracy: ', accuracy_score(y_test, preds))

Base GradientBoostingClassifier Metrics
---------------------------------------
F1 Score:  0.5555555555555556
Precision Score:  0.5555555555555556
Recall Score:  0.5555555555555556
Accuracy:  0.8048780487804879


In [50]:
#Evaluate trained model on test data

clf = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=42, class_weight = 'balanced'))

clf.fit(X_train, y_train)
preds = clf.predict(X_test)

print('Base RandomForestClassifier Metrics')
print('-----------------------------------')
print('F1 Score: ', f1_score(y_test, preds))
print('Precision Score: ', precision_score(y_test, preds))
print('Recall Score: ', recall_score(y_test, preds,))
print('Accuracy: ', accuracy_score(y_test, preds))

Base RandomForestClassifier Metrics
-----------------------------------
F1 Score:  0.43750000000000006
Precision Score:  0.5
Recall Score:  0.3888888888888889
Accuracy:  0.7804878048780488


In [51]:
clf = make_pipeline(StandardScaler(),LogisticRegression(random_state=42, class_weight='balanced'))

clf.fit(X_train, y_train)
preds = clf.predict(X_test)

print('Base LogisticRegression Metrics')
print('-----------------------------------')
print('F1 Score: ', f1_score(y_test, preds))
print('Precision Score: ', precision_score(y_test, preds))
print('Recall Score: ', recall_score(y_test, preds))
print('Accuracy: ', accuracy_score(y_test, preds))

Base LogisticRegression Metrics
-----------------------------------
F1 Score:  0.5217391304347826
Precision Score:  0.42857142857142855
Recall Score:  0.6666666666666666
Accuracy:  0.7317073170731707


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Hyperparameter Tuning to Identify Best Performing Model


In [52]:
#GB Hyperparameter Tune
RANDOM_STATE = 42
gb_clf = grid_cv_model(
     X=X_train,
     y=y_train,
     model= GradientBoostingClassifier(),
     params={
         "random_state": [RANDOM_STATE],
         "learning_rate": [0.01, 0.015, 0.02,0.05, 0.075, 0.1],
         "n_estimators": np.arange(1,100,1),
         "max_features":['sqrt', 'log2', None]
     },
     cv=5,
     scoring='recall'
 )

In [53]:
best_gb = gb_clf.best_estimator_
preds =  best_gb.predict(X_test)

print('Best GradientBoostingClassifier Metrics')
print('---------------------------------------')
print('Best Model:', best_gb)
print('F1 Score: ', f1_score(y_test, preds))
print('Precision Score: ', precision_score(y_test, preds))
print('Recall Score: ', recall_score(y_test, preds))
print('Accuracy: ', accuracy_score(y_test, preds))

Best GradientBoostingClassifier Metrics
---------------------------------------
Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('model',
                 GradientBoostingClassifier(learning_rate=0.075,
                                            n_estimators=70,
                                            random_state=42))])
F1 Score:  0.5142857142857143
Precision Score:  0.5294117647058824
Recall Score:  0.5
Accuracy:  0.7926829268292683


In [54]:
#RF Hyperparameter Tune
RANDOM_STATE = 42
rf_clf = grid_cv_model(
     X=X_train,
     y=y_train,
     model= RandomForestClassifier(class_weight = 'balanced'),
     params={
         "random_state": [RANDOM_STATE],
         "n_estimators": np.arange(1,100,1),
         "max_features":['sqrt', 'log2', None], 
         "criterion": ['gini', 'entropy', 'log_loss']
     },
     cv=5,
     scoring='recall'
 )

In [55]:
best_rf = rf_clf.best_estimator_
preds =  best_rf.predict(X_test)

print('Best RandomForestClassifier Metrics')
print('---------------------------------------')
print('Best Model:', best_rf)
print('F1 Score: ', f1_score(y_test, preds))
print('Precision Score: ', precision_score(y_test, preds))
print('Recall Score: ', recall_score(y_test, preds))
print('Accuracy: ', accuracy_score(y_test, preds))

Best RandomForestClassifier Metrics
---------------------------------------
Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('model',
                 RandomForestClassifier(class_weight='balanced',
                                        criterion='entropy', max_features=None,
                                        n_estimators=11, random_state=42))])
F1 Score:  0.6857142857142857
Precision Score:  0.7058823529411765
Recall Score:  0.6666666666666666
Accuracy:  0.8658536585365854


In [56]:
#LR Hyperparameter Tune
RANDOM_STATE = 42
lr_clf = grid_cv_model(
     X=X_train,
     y=y_train,
     model= LogisticRegression(class_weight='balanced'),
     params={
         "random_state": [RANDOM_STATE],
         "penalty": ['l2'],
         "solver": ['lbfgs', 'liblinear'], 
         "C": [1, 5, 10, 100, 1000], 
         "max_iter": np.arange(10000, 11000, 100)
     },
     cv=5,
     scoring='recall'
 )

In [57]:
best_lr = lr_clf.best_estimator_
preds = best_lr.predict(X_test)

print('Best LogisticRegression Metrics')
print('---------------------------------------')
print('Best Model:', best_lr)
print('F1 Score: ', f1_score(y_test, preds))
print('Precision Score: ', precision_score(y_test, preds))
print('Recall Score: ', recall_score(y_test, preds))
print('Accuracy: ', accuracy_score(y_test, preds))

Best LogisticRegression Metrics
---------------------------------------
Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('model',
                 LogisticRegression(C=100, class_weight='balanced',
                                    max_iter=10000, random_state=42))])
F1 Score:  0.5217391304347826
Precision Score:  0.42857142857142855
Recall Score:  0.6666666666666666
Accuracy:  0.7317073170731707


In [58]:
from joblib import dump, load
dump(best_rf, 'qb_classifier.joblib')

['qb_classifier.joblib']