In [23]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import SMOTE

from classifiers.clf_utils import grid_cv_model


In [None]:
#Load Dataset and Coaching Data
data = pd.read_csv('Data/Final Dataset.csv')

coaching_data = pd.read_csv('Data/Coaching Data.csv', skiprows = [0,1], skipfooter = 202)
coaching_data = coaching_data.rename(columns = {'FBS Team': 'Team'})
coaching_data = coaching_data[['Team','2019','2020','2021','2022','2023','2024']]
coaching_data = coaching_data.melt(id_vars='Team', var_name = 'Season', value_name = 'Coach')
coaching_data['Season'] = coaching_data['Season'].astype('int64')

In [25]:
#Define Coaching Change Function to Add Coaching Change Column to Data
def coach_change(row, data):

    team = str(row.Team)
    season = int(row.Season)

    try:
    
        curr_coach = data[(data['Team'] == team) & (data['Season'] == season)]['Coach']
        curr_coach = curr_coach[curr_coach.index[0]]
        next_coach = data[(data['Team'] == team) & (data['Season'] == season + 1)]['Coach']
        next_coach = next_coach[next_coach.index[0]]

        if curr_coach != next_coach:
            return 'Yes'
        return 'No'
        
    
    except:
        return 'N/A'

In [26]:
#Add coaching change info to data
data['Coach Change'] = data.apply(lambda x: coach_change(x, coaching_data), axis = 1)

In [27]:
positions = data.Position.unique()
positions

array(['RB', 'WR', 'QB', 'TE', 'LB', 'DB', 'OL', 'DL', 'CB', 'S', 'PK',
       'LS', 'P', 'DT', 'DE', 'FB', 'C', 'OT', 'G', 'NT', 'ATH', 'OLB',
       '?'], dtype=object)

In [28]:
#Add recruiting info
for i,year in enumerate(['2015','2016','2017','2018', '2019','2020', '2021', '2022', '2023']):
    file = pd.read_csv('Data/Player Recruit Ranking/' + year + '.csv')
    file.rename(columns = {'AthleteId': 'PlayerId', 'Year': 'Class of'}, inplace = True)
    #file.drop(columns = ['Year', 'Rating', 'Ranking'], inplace = True)
    #file.drop(columns = ['Year'], inplace = True)
    data = data.merge(file, on = 'PlayerId', how = 'left', suffixes = [None, '_' + str(i)])

In [29]:
#Combine Columns
for i in range(0,9):
    data['Stars'] = data['Stars'].combine_first(data['Stars_' + str(i)])
    data.drop(columns = ['Stars_' + str(i)], inplace = True)
for i in range(1,9):
    data['Rating'] = data['Rating'].combine_first(data['Rating_' + str(i)])
    data['Ranking'] = data['Ranking'].combine_first(data['Ranking_' + str(i)])
    data['Class of'] = data['Class of'].combine_first(data['Class of_' + str(i)])
    data.drop(columns = ['Rating_' + str(i)], inplace = True)
    data.drop(columns = ['Ranking_' + str(i)], inplace = True)
    data.drop(columns = ['Class of_' + str(i)], inplace = True)

data['Yr'] = data['Season'] - data['Class of'] + 1
data.drop(columns = ['Class of'], inplace = True)

In [30]:
position_groups = {'OL':['OL', 'NT', 'OT', 'G', 'C','FB'],
                   'TE':['TE'],
                   'QB':['QB'],
                   'RB':['RB'],
                   'WR':['WR'],
                   'DL':['DT', 'DE', 'DL'],
                   'DB':['DB', 'CB', 'S'],
                   'LB':['LB'],
                   'ST':['LS', 'P', 'PK']}

In [31]:
#Separate Data into Position Groups
data_sets = {}
for key in position_groups.keys():
    data_sets[key] = data.copy()[data.copy()['Position'].isin(position_groups[key])]

Testing with RB Data

In [32]:
rb = data_sets['RB'].copy().drop(columns = ['PositionId', 'ConferenceId', 'TeamId'])
#Fill NaN for Stars w/ Zero, since players can have Zero Star Rating and Yr w/ Zero for players w/o year information
rb['Stars'] = rb['Stars'].fillna(0)
rb['Yr'] = rb['Yr'].fillna(0)

In [33]:
#Engineer PCT Features
rb['Pct_Team_Rush_Yds'] = rb['YDS']/rb['Team rushingYards']
rb['Pct_Team_Rush_Attempts'] = rb['CAR']/rb['Team rushingAttempts']
rb['Pct_Team_Rush_TDs'] = rb['TD']/rb['Team rushingTDs']

In [34]:
#Define Improve Function
def improve(row, column, data):
    try: 
        id = int(row['PlayerId'])
        season = int(row['Season'])
        column = column
        imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])

        if imp:
            return 1 
        return -1
    except:
        return 0
 

In [35]:
#Engineer Improve Features
feats_to_improve = ['Usage Overall', 'Usage Rush', 'Usage Pass','Usage PassingDowns', 'Usage StandardDowns', 'Usage FirstDown', 'Usage SecondDown', 'Usage ThirdDown', 'AVG', 
         'CAR', 'YPC', 'REC', 'YPR', 'Pct_Team_Rush_Yds', 'Pct_Team_Rush_Attempts', 'Pct_Team_Rush_TDs']

for feat in feats_to_improve:
    rb[feat+'_improve'] = rb.apply(lambda x: improve(x, feat, rb), axis = 1)

  imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])
  imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])
  imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])
  imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])
  imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])
  imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])
  imp = float(data[(data['Pl

In [36]:
#Function to compare player stats with players of same year and star ranking
def compare(row, column, data):
    star = int(row['Stars'])
    year = int(row['Yr'])
    stat = column
    pos = str(row['Position'])
    id = int(row['PlayerId'])
    season = int(row['Season'])

    mean = data[(data['Stars'] == star) & (data['Yr'] == year) & (data['Position'] == pos)][stat].mean()
    p_stat = data[(data['PlayerId'] == id)&(data['Season'] == season)][stat]
    p_stat = p_stat[p_stat.index[0]]

    if int(p_stat>mean):
        return 1
    elif int(p_stat<mean):
        return -1
    else:
        return 0

In [37]:
feats_to_compare = ['Usage Overall', 'Usage Rush', 'Usage Pass','Usage PassingDowns', 'Usage StandardDowns', 'Usage FirstDown', 'Usage SecondDown', 'Usage ThirdDown', 'AVG', 
         'CAR', 'YPC', 'REC', 'YPR', 'Pct_Team_Rush_Yds', 'Pct_Team_Rush_Attempts', 'Pct_Team_Rush_TDs']

for feat in feats_to_compare:
    rb[feat + '_compare'] = rb.apply(lambda x: compare(x, feat, rb), axis = 1)

In [38]:
#Specify Relevant Columns
rel_feats = ['Usage Overall', 'Usage Rush','Usage StandardDowns', 'Usage FirstDown', 'Usage SecondDown', 'Usage ThirdDown', 'AVG', 
             'CAR', 'YPC', 'REC', 'YPR', 'Pct_Team_Rush_Yds', 'Pct_Team_Rush_Attempts', 'Pct_Team_Rush_TDs',
             'Usage Overall_improve', 'Usage Rush_improve', 'Usage Pass_improve',
             'Usage PassingDowns_improve', 'Usage StandardDowns_improve',
             'Usage FirstDown_improve', 'Usage SecondDown_improve',
             'Usage ThirdDown_improve', 'AVG_improve', 'CAR_improve', 'YPC_improve',
             'REC_improve', 'YPR_improve', 'Pct_Team_Rush_Yds_improve',
             'Pct_Team_Rush_Attempts_improve', 'Pct_Team_Rush_TDs_improve','Usage Overall_compare', 'Usage Rush_compare', 'Usage Pass_compare',
             'Usage PassingDowns_compare', 'Usage StandardDowns_compare',
             'Usage FirstDown_compare', 'Usage SecondDown_compare',
             'Usage ThirdDown_compare', 'AVG_compare', 'CAR_compare', 'YPC_compare',
             'REC_compare', 'YPR_compare', 'Pct_Team_Rush_Yds_compare',
             'Pct_Team_Rush_Attempts_compare', 'Pct_Team_Rush_TDs_compare',
             'Position','Yr','Stars', 'Coach Change', 'Ranking', 'Rating', 'Transfer_Portal']

#Get rid of 2019, narrow down to relevant features
rb = rb[rb['Season'] != 2019]
rb = rb[rel_feats]
#Encode Label Column
rb['Transfer_Portal'] = np.where(rb['Transfer_Portal'].values == 'Yes', 1, 0)

#Convert Yr and Stars to Categorical Variables
rb['Yr'] = rb['Yr'].astype('str')
rb['Stars'] = rb['Stars'].astype('str')




In [39]:
#Split into Training and Testing Data
X = rb.drop(columns=['Transfer_Portal'])
y = rb['Transfer_Portal']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.1,random_state=42, stratify=y)

#Specify the Numerical Features and Categorical Features
categorical = ['Yr', 'Stars','Position','Coach Change']

numerical = [feat for feat in X.columns if feat not in categorical]

#Pipeline to SimpleImpute and OneHot Encode Categorical Features (Training data only)
impute_encode = Pipeline([('impute',SimpleImputer(strategy='constant',fill_value='N/A')), ('encode',OneHotEncoder(handle_unknown='infrequent_if_exist'))])
column_transform = ColumnTransformer([('cat_encode', impute_encode, categorical), ('numerical_pass', SimpleImputer(strategy='constant',fill_value=0),numerical)])

X_train = column_transform.fit_transform(X_train)
col_trans = column_transform

#Fit Pipeline ColumnTransformer to testing features
X_test = column_transform.transform(X_test)

#SMOTE Balancing of Training Data
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train,y_train)



In [40]:
#CV Models w/ Training Data
cv = KFold(n_splits = 5, shuffle=True, random_state=42)

def classifiers():
    gb_clf = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=42))
    SVM_clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    forest_clf = RandomForestClassifier(random_state=42)

    gb_f1 = cross_val_score(gb_clf, X_train, y_train, cv=cv, scoring='f1')
    SVM_f1 = cross_val_score(SVM_clf, X_train, y_train, cv=cv, scoring='f1')
    forest_f1 = cross_val_score(forest_clf, X_train, y_train, cv=cv, scoring='f1')

    gb_pre = cross_val_score(gb_clf, X_train, y_train, cv=cv, scoring='precision')
    SVM_pre = cross_val_score(SVM_clf, X_train, y_train, cv=cv, scoring='precision')
    forest_pre = cross_val_score(forest_clf, X_train, y_train, cv=cv, scoring='precision')

    gb_re = cross_val_score(gb_clf, X_train, y_train, cv=cv, scoring='recall')
    SVM_re = cross_val_score(SVM_clf, X_train, y_train, cv=cv, scoring='recall')
    forest_re = cross_val_score(forest_clf, X_train, y_train, cv=cv, scoring='recall')

    print('F1 Scores:')
    print(gb_f1.mean())
    print(SVM_f1.mean())
    print(forest_f1.mean())
    print(' ')
    print('Precision Scores:')
    print(gb_f1.mean())
    print(SVM_pre.mean())
    print(forest_pre.mean())
    print(' ')
    print('Recall Scores:')
    print(gb_re.mean())
    print(SVM_re.mean())
    print(forest_re.mean())

classifiers()





F1 Scores:
0.9146637071543203
0.8821250336778288
0.9258144167801055
 
Precision Scores:
0.9146637071543203
0.8768966690016928
0.9565689439828187
 
Recall Scores:
0.8863992853051279
0.8876522380343858
0.8973049642861721


Comparing Base Models

In [41]:
#Dummy Classifier:

dummy = make_pipeline(StandardScaler(), DummyClassifier(random_state=42, strategy='stratified'))

dummy.fit(X_train, y_train)
preds = dummy.predict(X_test)


print('DummmyClassifier Metrics')
print('-----------------------------------')
print('F1 Score: ', f1_score(y_test, preds))
print('Precision Score: ', precision_score(y_test, preds))
print('Recall Score: ', recall_score(y_test, preds))
print('Accuracy: ', accuracy_score(y_test, preds))

DummmyClassifier Metrics
-----------------------------------
F1 Score:  0.2268041237113402
Precision Score:  0.14473684210526316
Recall Score:  0.5238095238095238
Accuracy:  0.4791666666666667


In [42]:
clf = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=42))

clf.fit(X_train, y_train)
preds = clf.predict(X_test)

print('Base GradientBoostingClassifier Metrics')
print('-----------------------------------')
print('F1 Score: ', f1_score(y_test, preds))
print('Precision Score: ', precision_score(y_test, preds))
print('Recall Score: ', recall_score(y_test, preds))
print('Accuracy: ', accuracy_score(y_test, preds))

Base GradientBoostingClassifier Metrics
-----------------------------------
F1 Score:  0.48484848484848486
Precision Score:  0.6666666666666666
Recall Score:  0.38095238095238093
Accuracy:  0.8819444444444444


In [43]:

clf = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=42))

clf.fit(X_train, y_train)
preds = clf.predict(X_test)

print('Base RandomForestClassifier Metrics')
print('-----------------------------------')
print('F1 Score: ', f1_score(y_test, preds))
print('Precision Score: ', precision_score(y_test, preds))
print('Recall Score: ', recall_score(y_test, preds))
print('Accuracy: ', accuracy_score(y_test, preds))

Base RandomForestClassifier Metrics
-----------------------------------
F1 Score:  0.35714285714285715
Precision Score:  0.7142857142857143
Recall Score:  0.23809523809523808
Accuracy:  0.875


In [44]:
clf = make_pipeline(StandardScaler(),LogisticRegression(random_state=42, class_weight='balanced'))

clf.fit(X_train, y_train)
preds = clf.predict(X_test)

print('Base LogisticRegression Metrics')
print('-----------------------------------')
print('F1 Score: ', f1_score(y_test, preds))
print('Precision Score: ', precision_score(y_test, preds))
print('Recall Score: ', recall_score(y_test, preds))
print('Accuracy: ', accuracy_score(y_test, preds))

Base LogisticRegression Metrics
-----------------------------------
F1 Score:  0.5
Precision Score:  0.38461538461538464
Recall Score:  0.7142857142857143
Accuracy:  0.7916666666666666


Hyperparameter Tuning to Identify Best Performing Model

In [45]:
#GB Hyperparameter Tune
RANDOM_STATE = 42
gb_clf = grid_cv_model(
     X=X_train,
     y=y_train,
     model= GradientBoostingClassifier(),
     params={
         "random_state": [RANDOM_STATE],
         "learning_rate": [0.01, 0.015, 0.02,0.05, 0.075, 0.1],
         "n_estimators": np.arange(1,100,1),
         "max_features":['sqrt', 'log2', None]
     },
     cv=5,
     scoring='recall'
 )

In [46]:
best_gb = gb_clf.best_estimator_
preds =  best_gb.predict(X_test)

print('Best GradientBoostingClassifier Metrics')
print('---------------------------------------')
print('Best Model:', best_gb)
print('F1 Score: ', f1_score(y_test, preds))
print('Precision Score: ', precision_score(y_test, preds))
print('Recall Score: ', recall_score(y_test, preds))
print('Accuracy: ', accuracy_score(y_test, preds))


Best GradientBoostingClassifier Metrics
---------------------------------------
Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('model',
                 GradientBoostingClassifier(learning_rate=0.075,
                                            n_estimators=26,
                                            random_state=42))])
F1 Score:  0.6
Precision Score:  0.631578947368421
Recall Score:  0.5714285714285714
Accuracy:  0.8888888888888888


In [47]:
#RF Hyperparameter Tune
RANDOM_STATE = 42
rf_clf = grid_cv_model(
     X=X_train,
     y=y_train,
     model= RandomForestClassifier(class_weight = 'balanced'),
     params={
         "random_state": [RANDOM_STATE],
         "n_estimators": np.arange(1,100,1),
         "max_features":['sqrt', 'log2', None], 
         "criterion": ['gini', 'entropy', 'log_loss']
     },
     cv=5,
     scoring='recall'
 )


In [48]:
best_rf = rf_clf.best_estimator_
preds =  best_rf.predict(X_test)

print('Best RandomForestClassifier Metrics')
print('---------------------------------------')
print('Best Model:', best_rf)
print('F1 Score: ', f1_score(y_test, preds))
print('Precision Score: ', precision_score(y_test, preds))
print('Recall Score: ', recall_score(y_test, preds))
print('Accuracy: ', accuracy_score(y_test, preds))


Best RandomForestClassifier Metrics
---------------------------------------
Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('model',
                 RandomForestClassifier(class_weight='balanced',
                                        max_features=None, n_estimators=7,
                                        random_state=42))])
F1 Score:  0.4444444444444444
Precision Score:  0.5333333333333333
Recall Score:  0.38095238095238093
Accuracy:  0.8611111111111112


In [49]:
#LR Hyperparameter Tune
RANDOM_STATE = 42
lr_clf = grid_cv_model(
     X=X_train,
     y=y_train,
     model= LogisticRegression(class_weight='balanced'),
     params={
         "random_state": [RANDOM_STATE],
         "penalty": ['l2'],
         "solver": ['lbfgs', 'liblinear'], 
         "C": [1, 5, 10, 100, 1000], 
         "max_iter": np.arange(10000, 11000, 100)
     },
     cv=5,
     scoring='recall'
 )

In [50]:
best_lr = lr_clf.best_estimator_
preds = best_lr.predict(X_test)

print('Best LogisticRegression Metrics')
print('---------------------------------------')
print('Best Model:', best_lr)
print('F1 Score: ', f1_score(y_test, preds))
print('Precision Score: ', precision_score(y_test, preds))
print('Recall Score: ', recall_score(y_test, preds))
print('Accuracy: ', accuracy_score(y_test, preds))

Best LogisticRegression Metrics
---------------------------------------
Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('model',
                 LogisticRegression(C=1000, class_weight='balanced',
                                    max_iter=10000, random_state=42,
                                    solver='liblinear'))])
F1 Score:  0.5
Precision Score:  0.4
Recall Score:  0.6666666666666666
Accuracy:  0.8055555555555556


In [52]:
from joblib import dump, load
dump(best_gb, 'rb_classifier.joblib')
dump(col_trans, "column_transform_rb.joblib")

['column_transform_rb.joblib']

In [53]:
X_pos = rb[rb['Transfer_Portal'] == 1].drop(columns=['Transfer_Portal'])
X_neg = rb[rb['Transfer_Portal'] == 0].drop(columns=['Transfer_Portal'])

X_pos = column_transform.transform(X_pos)
X_neg = column_transform.transform(X_neg)

def average(nest_list):
    probs = []
    for i in nest_list:
        probs.append(i[1])

    return sum(probs)/len(nest_list)

log_pos_proba = average(best_lr.predict_proba(X_pos))
log_neg_proba = average(best_lr.predict_proba(X_neg))

gb_pos_proba = average(best_gb.predict_proba(X_pos))
gb_neg_proba = average(best_gb.predict_proba(X_neg))

print('Logistic Regression probabilities of a player entering the transfer portal')
print('---------------------------------------')
print('Average probability of players who did enter: ', log_pos_proba)
print('Average probability of players who did NOT enter: ', log_neg_proba)
print(' ')
print('GB probabilities of a player entering the transfer portal')
print('---------------------------------------')
print('Average probability of players who did enter: ', gb_pos_proba)
print('Average probability of players who did NOT enter: ', gb_neg_proba)

Logistic Regression probabilities of a player entering the transfer portal
---------------------------------------
Average probability of players who did enter:  0.6706973226988481
Average probability of players who did NOT enter:  0.28022822714815077
 
GB probabilities of a player entering the transfer portal
---------------------------------------
Average probability of players who did enter:  0.5024618972312866
Average probability of players who did NOT enter:  0.2731613590373331
