In [113]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.utils._testing import ignore_warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

import torch
import torch.nn as nn
import torch.optim as optim

from skorch import NeuralNetBinaryClassifier

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline


In [114]:
#Load Dataset and Coaching Data
data = pd.read_csv('Data/Final Dataset.csv')

coaching_data = pd.read_csv('Data/Coaching Data.csv', skiprows = [0,1], skipfooter = 202)
coaching_data = coaching_data.rename(columns = {'FBS Team': 'Team'})
coaching_data = coaching_data[['Team','2019','2020','2021','2022','2023','2024']]
coaching_data = coaching_data.melt(id_vars='Team', var_name = 'Season', value_name = 'Coach')
coaching_data['Season'] = coaching_data['Season'].astype('int64')

  data = pd.read_csv('Data/Final Dataset.csv')
  coaching_data = pd.read_csv('Data/Coaching Data.csv', skiprows = [0,1], skipfooter = 202)


In [115]:
#Define Coaching Change Function to Add Coaching Change Column to Data
def coach_change(row, data):

    team = str(row.Team)
    season = int(row.Season)

    try:
    
        curr_coach = data[(data['Team'] == team) & (data['Season'] == season)]['Coach']
        curr_coach = curr_coach[curr_coach.index[0]]
        next_coach = data[(data['Team'] == team) & (data['Season'] == season + 1)]['Coach']
        next_coach = next_coach[next_coach.index[0]]

        if curr_coach != next_coach:
            return 'Yes'
        return 'No'
        
    
    except:
        return 'No'

In [116]:
#Add coaching change info to data
data['Coach Change'] = data.apply(lambda x: coach_change(x, coaching_data), axis = 1)

In [117]:
positions = data.Position.unique()
positions

array(['RB', 'WR', 'QB', 'TE', 'LB', 'DB', 'OL', 'DL', 'CB', 'S', 'PK',
       'LS', 'P', 'DT', 'DE', 'FB', 'C', 'OT', 'G', 'NT', 'ATH', 'OLB',
       '?'], dtype=object)

In [118]:
#Add recruiting info
for i,year in enumerate(['2015','2016','2017','2018', '2019','2020', '2021', '2022', '2023']):
    file = pd.read_csv('Data/Player Recruit Ranking/' + year + '.csv')
    file.rename(columns = {'AthleteId': 'PlayerId', 'Year': 'Class of'}, inplace = True)
    #file.drop(columns = ['Year', 'Rating', 'Ranking'], inplace = True)
    #file.drop(columns = ['Year'], inplace = True)
    data = data.merge(file, on = 'PlayerId', how = 'left', suffixes = [None, '_' + str(i)])

In [119]:
#Combine Columns
for i in range(0,9):
    data['Stars'] = data['Stars'].combine_first(data['Stars_' + str(i)])
    data.drop(columns = ['Stars_' + str(i)], inplace = True)
for i in range(1,9):
    data['Rating'] = data['Rating'].combine_first(data['Rating_' + str(i)])
    data['Ranking'] = data['Ranking'].combine_first(data['Ranking_' + str(i)])
    data['Class of'] = data['Class of'].combine_first(data['Class of_' + str(i)])
    data.drop(columns = ['Rating_' + str(i)], inplace = True)
    data.drop(columns = ['Ranking_' + str(i)], inplace = True)
    data.drop(columns = ['Class of_' + str(i)], inplace = True)

data['Yr'] = data['Season'] - data['Class of'] + 1
data.drop(columns = ['Class of'], inplace = True)

In [120]:
position_groups = {'OL':['OL', 'NT', 'OT', 'G', 'C','FB'],
                   'TE':['TE'],
                   'QB':['QB'],
                   'RB':['RB'],
                   'WR':['WR'],
                   'DL':['DT', 'DE', 'DL'],
                   'DB':['DB', 'CB', 'S'],
                   'LB':['LB'],
                   'ST':['LS', 'P', 'PK']}

In [121]:
#Separate Data into Position Groups
data_sets = {}
for key in position_groups.keys():
    data_sets[key] = data.copy()[data.copy()['Position'].isin(position_groups[key])]

Testing with QB Data

In [122]:
qb = data_sets['QB'].copy().drop(columns = ['PositionId', 'ConferenceId', 'TeamId'])

In [123]:
#Engineer PCT Features
qb['Pct_Team_Pass_Yds'] = qb['YDS']/qb['Team netPassingYards']
qb['Pct_Team_Pass_Attempts'] = qb['ATT']/qb['Team passAttempts']
qb['Pct_Team_Pass_TDs'] = qb['TD']/qb['Team passingTDs']
qb['Pct_Team_Pass_Completions'] = qb['COMPLETIONS']/qb['Team passCompletions']
qb['Pct_Team_Ints'] = qb['INT']/qb['Team passesIntercepted']





In [124]:
#Define Improve Function
def improve(row, column, data):
    try: 
        id = int(row['PlayerId'])
        season = int(row['Season'])
        column = column
        imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])

        if imp:
            return 'Yes' 
        return 'No'
    except:
        return 'Yes'
 

In [125]:
#Engineer Improve Features
feats_to_improve = ['Usage Overall', 'Usage Pass', 'Usage Rush', 'Usage FirstDown','Usage SecondDown', 'Usage ThirdDown', 'Usage StandardDowns',
       'Usage PassingDowns','ATT', 'AVG','COMPLETIONS','INT','LONG','PCT','TD','YDS', 'YPA','Pct_Team_Pass_Yds', 'Pct_Team_Pass_Attempts', 'Pct_Team_Pass_TDs',
       'Pct_Team_Pass_Completions', 'Pct_Team_Ints']

for feat in feats_to_improve:
    qb[feat+'_improve'] = qb.apply(lambda x: improve(x, feat, qb), axis = 1)

  imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])
  imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])
  imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])
  imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])
  imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])
  imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])
  imp = float(data[(data['Pl

In [126]:
#Specify Relevant Columns
#No team stats added as of right now
rel_feats = ['Usage Overall', 'Usage Pass', 'Usage Rush', 'Usage FirstDown','Usage SecondDown', 'Usage ThirdDown', 'Usage StandardDowns',
             'Usage PassingDowns','ATT','COMPLETIONS','INT','LONG','PCT','TD','YDS', 'YPA','Pct_Team_Pass_Yds', 'Pct_Team_Pass_Attempts', 'Pct_Team_Pass_TDs',
             'Pct_Team_Pass_Completions', 'Pct_Team_Ints', 'Usage Overall_improve',
             'Usage Pass_improve', 'Usage Rush_improve', 'Usage FirstDown_improve',
             'Usage SecondDown_improve', 'Usage ThirdDown_improve',
             'Usage StandardDowns_improve', 'Usage PassingDowns_improve',
             'ATT_improve', 'AVG_improve', 'COMPLETIONS_improve', 'INT_improve',
             'LONG_improve', 'PCT_improve', 'TD_improve', 'YDS_improve',
             'YPA_improve', 'Pct_Team_Pass_Yds_improve',
             'Pct_Team_Pass_Attempts_improve', 'Pct_Team_Pass_TDs_improve',
             'Pct_Team_Pass_Completions_improve', 'Pct_Team_Ints_improve', 'Team','Conference','Position','Yr','Stars', 'Coach Change', 'Ranking', 'Rating', 'Transfer_Portal']


#Get rid of 2019, narrow down to relevant features
qb = qb[qb['Season'] != 2019]
qb = qb[rel_feats]
#Encode Label Columns
qb['Transfer_Portal'] = np.where(qb['Transfer_Portal'].values == 'Yes', 1, 0)

#Convert Yr and Stars to Categorical Variables
qb['Yr'] = qb['Yr'].astype('str')
qb['Stars'] = qb['Stars'].astype('str')




In [127]:
#Split into Training and Testing Data
X = qb.drop(columns=['Transfer_Portal'])
y = qb['Transfer_Portal']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=42)

#Specify the Numerical Features and Categorical Features
categorical = ['Usage Overall_improve',
               'Usage Pass_improve', 'Usage Rush_improve', 'Usage FirstDown_improve',
               'Usage SecondDown_improve', 'Usage ThirdDown_improve',
               'Usage StandardDowns_improve', 'Usage PassingDowns_improve',
               'ATT_improve', 'AVG_improve', 'COMPLETIONS_improve', 'INT_improve',
               'LONG_improve', 'PCT_improve', 'TD_improve', 'YDS_improve',
               'YPA_improve', 'Pct_Team_Pass_Yds_improve',
               'Pct_Team_Pass_Attempts_improve', 'Pct_Team_Pass_TDs_improve',
               'Pct_Team_Pass_Completions_improve', 'Pct_Team_Ints_improve', 'Team','Conference','Position','Yr','Stars', 'Coach Change']

numerical = [feat for feat in X.columns if feat not in categorical]

#Pipeline to SimpleImpute and OneHot Encode Categorical Features (Training data only)
impute_encode = Pipeline([('impute',SimpleImputer(strategy='constant',fill_value='N/A')), ('encode',OneHotEncoder(handle_unknown='ignore'))])
column_transform = ColumnTransformer([('cat_encode', impute_encode, categorical), ('numerical_pass', SimpleImputer(strategy='constant',fill_value=0),numerical)])

X_train = column_transform.fit_transform(X_train)

#Fit Pipeline ColumnTransformer to testing features
X_test = column_transform.transform(X_test)

#SMOTE Balancing of Training Data
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train,y_train)



In [128]:
#CV Models w/ Training Data
cv = KFold(n_splits = 5)

def classifiers():
    gb_clf = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=42))
    SVM_clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    forest_clf = RandomForestClassifier(random_state=42)

    gb_f1 = cross_val_score(gb_clf, X_train, y_train, cv=cv, scoring='f1')
    SVM_f1 = cross_val_score(SVM_clf, X_train, y_train, cv=cv, scoring='f1')
    forest_f1 = cross_val_score(forest_clf, X_train, y_train, cv=cv, scoring='f1')

    gb_pre = cross_val_score(gb_clf, X_train, y_train, cv=cv, scoring='precision')
    SVM_pre = cross_val_score(SVM_clf, X_train, y_train, cv=cv, scoring='precision')
    forest_pre = cross_val_score(forest_clf, X_train, y_train, cv=cv, scoring='precision')

    gb_re = cross_val_score(gb_clf, X_train, y_train, cv=cv, scoring='recall')
    SVM_re = cross_val_score(SVM_clf, X_train, y_train, cv=cv, scoring='recall')
    forest_re = cross_val_score(forest_clf, X_train, y_train, cv=cv, scoring='recall')

    print('F1 Scores:')
    print(gb_f1.mean())
    print(SVM_f1.mean())
    print(forest_f1.mean())
    print(' ')
    print('Precision Scores:')
    print(gb_f1.mean())
    print(SVM_pre.mean())
    print(forest_pre.mean())
    print(' ')
    print('Recall Scores:')
    print(gb_re.mean())
    print(SVM_re.mean())
    print(forest_re.mean())

classifiers()





F1 Scores:
0.7467362435940597
0.7837775458956131
0.7475913415001495
 
Precision Scores:
0.7467362435940597
0.7558223866790009
0.7983031674208145
 
Recall Scores:
0.7318692321889996
0.8311387966039128
0.713686323366556


In [129]:
#Evaluate trained model on test data
#Use 'weighted' f1 score, recall, precision since data is imbalanced

clf = GradientBoostingClassifier(random_state=42)

clf.fit(X_train, y_train)
preds = clf.predict(X_test)

print('Base GradientBoostingClassifier Metrics')
print('---------------------------------------')
print('F1 Score: ', f1_score(y_test, preds, average='weighted'))
print('Precision Score: ', precision_score(y_test, preds, average='weighted'))
print('Recall Score: ', recall_score(y_test, preds, average = 'weighted'))
print('Accuracy: ', accuracy_score(y_test, preds))

Base GradientBoostingClassifier Metrics
---------------------------------------
F1 Score:  0.8154114886770417
Precision Score:  0.813318707984453
Recall Score:  0.8327137546468402
Accuracy:  0.8327137546468402


In [130]:
#Evaluate trained model on test data
#Use 'weighted' f1 score, recall, precision since data is imbalanced

clf = RandomForestClassifier(random_state=42, class_weight = 'balanced')

clf.fit(X_train, y_train)
preds = clf.predict(X_test)

print('Base RandomForestClassifier Metrics')
print('-----------------------------------')
print('F1 Score: ', f1_score(y_test, preds, average='weighted'))
print('Precision Score: ', precision_score(y_test, preds, average='weighted'))
print('Recall Score: ', recall_score(y_test, preds, average = 'weighted'))
print('Accuracy: ', accuracy_score(y_test, preds))

Base RandomForestClassifier Metrics
-----------------------------------
F1 Score:  0.793678481782571
Precision Score:  0.8070074102598281
Recall Score:  0.828996282527881
Accuracy:  0.828996282527881


In [131]:
#GridSearch/Optimize/Hyperparameter Tune