In [46]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline


In [45]:
#Load Dataset and Coaching Data
data = pd.read_csv('Data/Final Dataset.csv')

coaching_data = pd.read_csv('Data/Coaching Data.csv', skiprows = [0,1], skipfooter = 202)
coaching_data = coaching_data.rename(columns = {'FBS Team': 'Team'})
coaching_data = coaching_data[['Team','2019','2020','2021','2022','2023','2024']]
coaching_data = coaching_data.melt(id_vars='Team', var_name = 'Season', value_name = 'Coach')
coaching_data['Season'] = coaching_data['Season'].astype('int64')

In [5]:
#Define Coaching Change Function to Add Coaching Change Column to Data
def coach_change(row, data):

    team = str(row.Team)
    season = int(row.Season)

    try:
    
        curr_coach = data[(data['Team'] == team) & (data['Season'] == season)]['Coach']
        curr_coach = curr_coach[curr_coach.index[0]]
        next_coach = data[(data['Team'] == team) & (data['Season'] == season + 1)]['Coach']
        next_coach = next_coach[next_coach.index[0]]

        if curr_coach != next_coach:
            return 'Yes'
        return 'No'
        
    
    except:
        return 'No'

In [6]:
#Add coaching change info to data
data['Coach Change'] = data.apply(lambda x: coach_change(x, coaching_data), axis = 1)

In [7]:
positions = data.Position.unique()
positions

array(['RB', 'WR', 'QB', 'TE', 'LB', 'DB', 'OL', 'DL', 'CB', 'S', 'PK',
       'LS', 'P', 'DT', 'DE', 'FB', 'C', 'OT', 'G', 'NT', 'ATH', 'OLB',
       '?'], dtype=object)

In [8]:
#Add recruiting info
for i,year in enumerate(['2015','2016','2017','2018', '2019','2020', '2021', '2022', '2023']):
    file = pd.read_csv('Data/Player Recruit Ranking/' + year + '.csv')
    file.rename(columns = {'AthleteId': 'PlayerId', 'Year': 'Class of'}, inplace = True)
    data = data.merge(file, on = 'PlayerId', how = 'left', suffixes = [None, '_' + str(i)])

In [9]:
#Combine Columns
for i in range(0,9):
    data['Stars'] = data['Stars'].combine_first(data['Stars_' + str(i)])
    data.drop(columns = ['Stars_' + str(i)], inplace = True)
for i in range(1,9):
    data['Rating'] = data['Rating'].combine_first(data['Rating_' + str(i)])
    data['Ranking'] = data['Ranking'].combine_first(data['Ranking_' + str(i)])
    data['Class of'] = data['Class of'].combine_first(data['Class of_' + str(i)])
    data.drop(columns = ['Rating_' + str(i)], inplace = True)
    data.drop(columns = ['Ranking_' + str(i)], inplace = True)
    data.drop(columns = ['Class of_' + str(i)], inplace = True)

data['Yr'] = data['Season'] - data['Class of'] + 1
data.drop(columns = ['Class of'], inplace = True)

In [10]:
position_groups = {'OL':['OL', 'NT', 'OT', 'G', 'C','FB'],
                   'TE':['TE'],
                   'QB':['QB'],
                   'RB':['RB'],
                   'WR':['WR'],
                   'DL':['DT', 'DE', 'DL'],
                   'DB':['DB', 'CB', 'S'],
                   'LB':['LB'],
                   'ST':['LS', 'P', 'PK']}

In [11]:
#Separate Data into Position Groups
data_sets = {}
for key in position_groups.keys():
    data_sets[key] = data.copy()[data.copy()['Position'].isin(position_groups[key])]

Testing Defensive data as a whole

In [14]:
DL = data_sets['DL'].copy().drop(columns = ['PositionId', 'ConferenceId', 'TeamId'])
DB = data_sets['DB'].copy().drop(columns = ['PositionId', 'ConferenceId', 'TeamId'])
LB = data_sets['LB'].copy().drop(columns = ['PositionId', 'ConferenceId', 'TeamId'])

defense = pd.concat([DL,DB,LB])

In [15]:
defense.columns

Index(['Season', 'PlayerId', 'Player', 'Position', 'Team', 'Conference',
       'Usage Overall', 'Usage Pass', 'Usage Rush', 'Usage FirstDown',
       'Usage SecondDown', 'Usage ThirdDown', 'Usage StandardDowns',
       'Usage PassingDowns', 'ATT', 'AVG', 'CAR', 'COMPLETIONS', 'FGA', 'FGM',
       'FUM', 'INT', 'In 20', 'LONG', 'LOST', 'NO', 'PCT', 'PD', 'PTS',
       'QB HUR', 'REC', 'SACKS', 'SOLO', 'TB', 'TD', 'TFL', 'TOT', 'XPA',
       'XPM', 'YDS', 'YPA', 'YPC', 'YPP', 'YPR', 'Division', 'ExpectedWins',
       'Total Games', 'Total Wins', 'Total Losses', 'Total Ties',
       'ConferenceGames Games', 'ConferenceGames Wins',
       'ConferenceGames Losses', 'ConferenceGames Ties', 'HomeGames Games',
       'HomeGames Wins', 'HomeGames Losses', 'HomeGames Ties',
       'AwayGames Games', 'AwayGames Wins', 'AwayGames Losses',
       'AwayGames Ties', 'Team firstDowns', 'Team fourthDownConversions',
       'Team fourthDowns', 'Team fumblesLost', 'Team fumblesRecovered',
       'Team g

In [16]:
#Engineer PCT Features
defense['Pct_Team_INT'] = defense['INT']/defense['Team interceptions']
defense['Pct_Team_SACKS'] = defense['SACKS']/defense['Team sacks']
defense['Pct_Team_TFL'] = defense['TFL']/defense['Team tacklesForLoss']

In [17]:
#Define Improve Function
def improve(row, column, data):
    try: 
        id = int(row['PlayerId'])
        season = int(row['Season'])
        column = column
        imp = float(data[(data['PlayerId'] == id) & (data['Season'] == season)][column]) > float(data[(data['PlayerId'] == id) & (data['Season'] == season-1)][column])

        if imp:
            return 'Yes' 
        return 'No'
    except:
        return 'Yes'
 

In [44]:
#Engineer Improve Features
feats_to_improve = ['FUM', 'INT', 'PD', 'QB HUR', 'SACKS', 'SOLO', 'Pct_Team_INT', 'Pct_Team_SACKS', 'Pct_Team_TFL']

for feat in feats_to_improve:
    defense[feat+'_improve'] = defense.apply(lambda x: improve(x, feat, defense), axis = 1)

In [34]:
#Specify Relevant Columns
#No team stats added as of right now
rel_feats = ['FUM', 'INT', 'PD', 'QB HUR', 'SACKS', 'SOLO', 'Pct_Team_INT', 'Pct_Team_SACKS', 'Pct_Team_TFL',
             'FUM_improve', 'INT_improve', 'PD_improve', 'QB HUR_improve', 'SACKS_improve', 'SOLO_improve', 
             'Pct_Team_INT_improve', 'Pct_Team_SACKS_improve', 'Pct_Team_TFL_improve', 
             'Team','Conference','Position','Yr','Stars', 'Coach Change', 'Ranking', 'Rating', 'Transfer_Portal']


#Get rid of 2019, narrow down to relevant features
defense = defense[defense['Season'] != 2019]
defense = defense[rel_feats]
#Encode Label Columns
defense['Transfer_Portal'] = np.where(defense['Transfer_Portal'].values == 'Yes', 1, 0)

#Convert Yr and Stars to Categorical Variables
defense['Yr'] = defense['Yr'].astype('str')
defense['Stars'] = defense['Stars'].astype('str')




In [35]:
# #Split into Training and Testing Data
# X = defense.drop(columns=['Transfer_Portal'])
# y = defense['Transfer_Portal']

# X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=42)

# #Specify the Numerical Features and Categorical Features
# categorical = ['FUM_improve', 'INT_improve', 'PD_improve', 'QB HUR_improve', 'SACKS_improve', 'SOLO_improve', 
#              'Pct_Team_INT_improve', 'Pct_Team_SACKS_improve', 'Pct_Team_TFL_improve', 
#              'Team','Conference','Position','Yr','Stars', 'Coach Change']

# numerical = [feat for feat in X.columns if feat not in categorical]

# #Pipeline to SimpleImpute and OneHot Encode Categorical Features (Training data only)
# impute_encode = Pipeline([('impute',SimpleImputer(strategy='constant',fill_value='N/A')), ('encode',OneHotEncoder(handle_unknown='ignore'))])
# column_transform = ColumnTransformer([('cat_encode', impute_encode, categorical), ('numerical_pass', SimpleImputer(strategy='constant',fill_value=0),numerical)])

# X_train = column_transform.fit_transform(X_train)

# #Fit Pipeline ColumnTransformer to testing features
# X_test = X_test.fillna(0)
# X_test = column_transform.transform(X_test)

# #SMOTE Balancing of Training Data
# smote = SMOTE(random_state=42)
# X_train, y_train = smote.fit_resample(X_train,y_train)



The reason I commented out the above section was because it led to me using only the X_train and y_train data for cross validation.
In the below cell I used the entire dataset, and split the data later

In [63]:
X = defense.drop(columns=['Transfer_Portal'])
y = defense['Transfer_Portal']

#Specify the Numerical Features and Categorical Features
categorical = ['FUM_improve', 'INT_improve', 'PD_improve', 'QB HUR_improve', 'SACKS_improve', 'SOLO_improve', 
             'Pct_Team_INT_improve', 'Pct_Team_SACKS_improve', 'Pct_Team_TFL_improve', 
             'Team','Conference','Position','Yr','Stars', 'Coach Change']

numerical = [feat for feat in X.columns if feat not in categorical]

#Pipeline to SimpleImpute and OneHot Encode Categorical Features (Training data only)
impute_encode = Pipeline([('impute',SimpleImputer(strategy='constant',fill_value='N/A')), ('encode',OneHotEncoder(handle_unknown='ignore'))])
column_transform = ColumnTransformer([('cat_encode', impute_encode, categorical), ('numerical_pass', SimpleImputer(strategy='constant',fill_value=0),numerical)])

X = column_transform.fit_transform(X)

#SMOTE Balancing of Training Data
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X,y)

In [65]:
#CV Models w/ Training Data
cv = KFold(n_splits = 5)

@ignore_warnings(category=ConvergenceWarning)
def classifiers():
    gb_clf = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=42))
    SVM_clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    forest_clf = RandomForestClassifier(random_state=42)
    log_clf = make_pipeline(StandardScaler(), LogisticRegression(max_iter=10000, class_weight='balanced'))

    gb_f1 = cross_val_score(gb_clf, X.toarray(), y, cv=cv, scoring='f1')
    SVM_f1 = cross_val_score(SVM_clf, X.toarray(), y, cv=cv, scoring='f1')
    forest_f1 = cross_val_score(forest_clf, X.toarray(), y, cv=cv, scoring='f1')
    log_f1 = cross_val_score(log_clf, X.toarray(), y, cv=cv, scoring='f1')

    gb_pre = cross_val_score(gb_clf, X.toarray(), y, cv=cv, scoring='precision')
    SVM_pre = cross_val_score(SVM_clf, X.toarray(), y, cv=cv, scoring='precision')
    forest_pre = cross_val_score(forest_clf, X.toarray(), y, cv=cv, scoring='precision')
    log_pre = cross_val_score(log_clf, X.toarray(), y, cv=cv, scoring='precision')

    gb_re = cross_val_score(gb_clf, X.toarray(), y, cv=cv, scoring='recall')
    SVM_re = cross_val_score(SVM_clf, X.toarray(), y, cv=cv, scoring='recall')
    forest_re = cross_val_score(forest_clf, X.toarray(), y, cv=cv, scoring='recall')
    log_re = cross_val_score(log_clf, X.toarray(), y, cv=cv, scoring='recall')

    print('F1 Scores:')
    print(gb_f1.mean())
    print(SVM_f1.mean())
    print(forest_f1.mean())
    print(log_f1.mean())
    print(' ')
    print('Precision Scores:')
    print(gb_pre.mean())
    print(SVM_pre.mean())
    print(forest_pre.mean())
    print(log_pre.mean())
    print(' ')
    print('Recall Scores:')
    print(gb_re.mean())
    print(SVM_re.mean())
    print(forest_re.mean())
    print(log_re.mean())

classifiers()

F1 Scores:
0.6115639851019453
0.6614724094435026
0.6531328813168102
0.5646135407849021
 
Precision Scores:
0.5438576951350073
0.5953573057288597
0.585133210427884
0.522724862852422
 
Recall Scores:
0.9456965275546295
0.9663177877239593
0.9421447638197037
0.946865964122604


In [66]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=42)

In [67]:
#Evaluate trained model on test data
#Use 'weighted' f1 score, recall, precision since data is imbalanced

clf = RandomForestClassifier(random_state=42, class_weight = 'balanced')

clf.fit(X_train, y_train)
preds = clf.predict(X_test)

print('Base RandomForestClassifier Metrics')
print('-----------------------------------')
print('F1 Score: ', f1_score(y_test, preds))
print('Precision Score: ', precision_score(y_test, preds))
print('Recall Score: ', recall_score(y_test, preds))
print('Accuracy: ', accuracy_score(y_test, preds))

Base RandomForestClassifier Metrics
-----------------------------------
F1 Score:  0.9713322091062394
Precision Score:  0.9990089197224975
Recall Score:  0.9451476793248945
Accuracy:  0.9723897911832947


In [71]:
#Evaluate trained model on test data

clf = GradientBoostingClassifier(random_state=42)

clf.fit(X_train.toarray(), y_train)
preds = clf.predict(X_test.toarray())

print('Base GradientBoostingClassifer Metrics')
print('---------------------------------------')
print('F1 Score: ', f1_score(y_test, preds))
print('Precision Score: ', precision_score(y_test, preds))
print('Recall Score: ', recall_score(y_test, preds))
print('Accuracy: ', accuracy_score(y_test, preds))

Base GradientBoostingClassifer Metrics
---------------------------------------
F1 Score:  0.9688781664656212
Precision Score:  0.9980119284294234
Recall Score:  0.9413970932958274
Accuracy:  0.9700696055684455


In [69]:
#Evaluate trained model on test data
#Use 'weighted' f1 score, recall, precision since data is imbalanced

clf = LogisticRegression(max_iter=10000, class_weight='balanced')

clf.fit(X_train.toarray(), y_train)
preds = clf.predict(X_test.toarray())

print('Base LogisticRegressionClassifer Metrics')
print('---------------------------------------')
print('F1 Score: ', f1_score(y_test, preds))
print('Precision Score: ', precision_score(y_test, preds))
print('Recall Score: ', recall_score(y_test, preds))
print('Accuracy: ', accuracy_score(y_test, preds))

Base LogisticRegressionClassifer Metrics
---------------------------------------
F1 Score:  0.843859649122807
Precision Score:  0.7927482488669139
Recall Score:  0.9020159399906236
Accuracy:  0.8348027842227378


In [70]:
#Dummy Classifier:

dummy = make_pipeline(StandardScaler(), DummyClassifier(random_state=42, strategy='stratified'))

dummy.fit(X_train.toarray(), y_train)
preds = dummy.predict(X_test.toarray())

print('DummmyClassifier Metrics')
print('-----------------------------------')
print('F1 Score: ', f1_score(y_test, preds))
print('Precision Score: ', precision_score(y_test, preds))
print('Recall Score: ', recall_score(y_test, preds))
print('Accuracy: ', accuracy_score(y_test, preds))

DummmyClassifier Metrics
-----------------------------------
F1 Score:  0.5012811553692057
Precision Score:  0.4981481481481482
Recall Score:  0.5044538209095171
Accuracy:  0.5032482598607889


In [None]:
#GridSearch/Optimize/Hyperparameter Tune

#Sklearn Job-lib

#Full player data from collegefootballapi

We will continue to test and tune the logistic regression and gradient boost models