In [463]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import scipy as sp
import seaborn as sns
import numpy as np
import time

In [464]:
#read dataset
career = pd.read_csv('Data/career.csv')
career = career.set_index('Unnamed: 0',drop=True)
career.index.name= 'index'
career.shape

#set X and y
X = career.drop(['Player Id','Name','HOF'],axis=1)
y = career['HOF']
X.columns

Index(['RRTD', 'SB MVP', 'Receiving Yards Per Game', 'Rushing Yards adj',
       'MVP', 'PGWD', 'TD Passes adj', 'RRYd', 'Passing Yards Per Game', 'SB',
       'Position'],
      dtype='object')

In [465]:
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler, NearMiss
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score,f1_score, auc, precision_recall_curve
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

In [466]:
def sample_and_predict(sampler,clf,confusion=True,X=X,y=y):
    #split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X,y)
    #resample
    if sampler == 'nosampler':
        Xtr, ytr = X_train, y_train
    else:
        Xtr, ytr = sampler.fit_sample(X_train,y_train)
    #fit classifier
    clf.fit(Xtr,ytr)
    #make predictions
    predictions = clf.predict(X_test)
    #get scores
    accuracy = accuracy_score(y_test,predictions)
    precision = precision_score(y_test,predictions)
    recall = recall_score(y_test,predictions)
    f1 = f1_score(y_test,predictions)
    #auc
    if confusion==True:
        #confusion matrix
        cm = confusion_matrix(y_test,predictions)
        #make dataframe of results
        df = pd.DataFrame(cm)
        df.index.name = 'Actual'
        df.T.index.name = 'Predicted'
        #plot it
        plt.style.use('fivethirtyeight')
        fig = plt.figure(figsize=(4,2))
        ax = plt.subplot(111)

        sns.heatmap(df,annot=True,fmt='g',cmap='Blues',annot_kws={"size":16,'fontweight':'bold'})
        ax.tick_params(axis='x',labelsize=16)
        ax.tick_params(axis='y',labelsize=16)
        plt.show()
        print('Accuracy: {:0.2%}'.format(accuracy))
        print('Precision: {:0.2%}'.format(precision))
        print('Recall: {:0.2%}'.format(recall))
        print(str(sampler).split('(')[0])
    return accuracy, precision, recall, f1

In [467]:
classifiers = [GradientBoostingClassifier(max_depth=7,max_features=.2,\
min_samples_leaf=5,min_samples_split=.1,subsample=.9)]
samplers = [RandomOverSampler(), SMOTE(),RandomUnderSampler(),'nosampler']
performance = pd.DataFrame(columns=['classifier','sampler','accuracy','precision','recall','f1','auc'])
count = 0
for a in np.arange(0,50):
    for c in classifiers:
        for s in samplers:
            
            performance.loc[count,'classifier'] = str(c).split('(')[0]
            performance.loc[count,'sampler'] = str(s).split('(')[0]
            performance.loc[count,'accuracy'], performance.loc[count,'precision'], \
            performance.loc[count,'recall'], performance.loc[count,'f1']\
            = sample_and_predict(s,c,confusion=False)
            count+=1
        
for a in ['accuracy','precision','recall','f1']:
    performance[a] = performance[a].astype(float)
performance.pivot_table(index=['classifier','sampler']).sort_values('f1',ascending=False)


Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,precision,recall
classifier,sampler,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GradientBoostingClassifier,RandomOverSampler,0.993878,0.863399,0.845099,0.895539
GradientBoostingClassifier,nosampler,0.993061,0.835078,0.862413,0.822773
GradientBoostingClassifier,SMOTE,0.99243,0.819035,0.78708,0.874396
GradientBoostingClassifier,RandomUnderSampler,0.962672,0.541948,0.379435,0.995238


In [325]:
really_hof = []
notreally_hof = []

feats = pd.DataFrame()
feats['features'] = X.columns
feats['importance'] = 0

### Test with best possible model

In [326]:
start= time.time()
for a in np.arange(0,20):
    clf = GradientBoostingClassifier()
    X_train, X_test, y_train, y_test = train_test_split(X,y)
    Xtr, ytr = SMOTE().fit_sample(X_train,y_train)
    clf.fit(Xtr,ytr)
    predictions = clf.predict(X_test)
    results = pd.DataFrame()
    results['Name'] = career.loc[y_test.index,'Name']
    results['truth'] = y_test
    results['prediction'] = predictions
    results['correct'] = results['truth'] == results['prediction']

    cond1 = results['correct']==False
    cond2 = results['truth']==1
    cond3 = results['truth']==0
    really_hof.extend(results['Name'][cond1 & cond2])
    notreally_hof.extend(results['Name'][cond1 & cond3])
    feats['importance']+= clf.feature_importances_
time.time()-start

4.548666000366211

In [353]:
pd.DataFrame(really_hof)[0].value_counts()[0:20]

Stallworth, John     4
Fouts, Dan           3
Moon, Warren         3
Davis, Terrell       3
Irvin, Michael       3
Aikman, Troy         3
Sayers, Gale         2
Kelly, Jim           2
Campbell, Earl       2
Carter, Cris         1
Griese, Bob          1
Little, Floyd        1
Tarkenton, Fran      1
Lofton, James        1
Swann, Lynn          1
Biletnikoff, Fred    1
Monk, Art            1
Riggins, John        1
Name: 0, dtype: int64

In [352]:
pd.DataFrame(notreally_hof)[0].value_counts()[0:20]

Watters, Ricky        10
Walker, Herschel       8
Jackson, Harold        7
Garrett, Mike          6
Ellard, Henry          4
Foreman, Chuck         4
Anderson, Donny        4
Anderson, Ken          3
Anderson, Ottis        3
Matte, Tom             3
Pearson, Drew          2
Alworth, Lance         2
Garrison, Gary         2
George, Eddie          2
McCauley, Don          2
Carmichael, Harold     2
Gabriel, Roman         1
Warner, Curt           1
Nance, Jim             1
Everett, Jim           1
Name: 0, dtype: int64

In [329]:
feats.sort_values('importance',ascending=False)

Unnamed: 0,features,importance
0,RRTD,5.200466
7,RRYd,3.851893
6,TD Passes adj,2.958514
3,Rushing Yards adj,2.150925
9,SB,2.139703
5,PGWD,1.213802
1,SB MVP,1.021583
4,MVP,0.727525
2,Receiving Yards Per Game,0.360438
8,Passing Yards Per Game,0.313903


In [335]:
def gridsearchin(clf,param_d,X,y):
    #split data into train and test
    X_train, X_test, y_train, y_test = train_test_split(X,y)
    #define and fit grid search clf
    gclf = GridSearchCV(clf,param_d)
    gclf.fit(X_train,y_train)
    paramcount = len(param_d)
    #make new df
    df = pd.DataFrame(gclf.cv_results_).iloc[:,np.arange(2,paramcount+4)]\
    .sort_values('mean_test_score',ascending=False).reset_index()
    newdf= pd.DataFrame(columns=['hyper-param','optimal_val'])
    count = 0
    for a in param_d.keys():
        value = df.groupby('param_'+a).mean().sort_values('mean_test_score',ascending=False).reset_index().iloc[0,0]
        newdf.loc[count,'hyper-param']=a
        newdf.loc[count,'optimal_val']=value
        count+=1
    return df, newdf

In [345]:
param_dict = {'max_depth': [5, 6, 7, 8],
 'max_features': [0.1, 0.2, 0.3, 0.4],
 'min_samples_leaf': [5, 10, 20, 50],
 'min_samples_split': [0.05, 0.075, 0.1],
 'subsample': [0.5, 0.75, 0.9, 1]}


clf = GradientBoostingClassifier()
df, newdf = gridsearchin(clf,param_dict,X,y)

In [346]:
newdf

Unnamed: 0,hyper-param,optimal_val
0,min_samples_split,0.075
1,max_features,0.2
2,subsample,0.9
3,max_depth,7.0


In [347]:
df.head()

Unnamed: 0,index,mean_test_score,mean_train_score,param_max_depth,param_max_features,param_min_samples_split,param_subsample
0,38,0.992565,1.0,7,0.3,0.075,0.9
1,8,0.991945,1.0,6,0.2,0.075,0.9
2,10,0.990706,1.0,6,0.2,0.1,0.9
3,25,0.990087,1.0,7,0.1,0.05,1.0
4,34,0.990087,1.0,7,0.2,0.1,0.9


# Test On Current NFL Players

In [468]:
#read dataset
currentplayers = pd.read_csv('Data/currentplayers.csv')
currentplayers = currentplayers.set_index('Unnamed: 0',drop=True)
currentplayers.index.name= 'index'

#set X and y
X_cur = currentplayers.drop(['Player Id','Name','HOF'],axis=1)
y_cur = currentplayers['HOF']

#fit model
clff = GradientBoostingClassifier(min_samples_leaf=5,min_samples_split=.1,max_features=.2,subsample=.9,max_depth=7)
clf2 = LogisticRegression()

names = ['Newton, Cam','Artis-Payne, Cameron','Barber, Peyton','Bray, Quan', 'Coates, Sammie', 'Grant, Corey',
         'Louis, Ricardo','Prosch, Jay','Williams, Carnell','Brown, Ronnie','Mason, Tre','Campbell, Jason',
         'Davis, Stephen', 'Jackson, Bo','Sullivan, Pat', 'Beasley, Terry', 'Brooks, James']
newdf = pd.DataFrame(columns=currentplayers.columns)
count=0
for a in currentplayers.index:
    name = currentplayers.loc[a,'Name']
    if name in names:
        player = currentplayers.loc[a]
        newdf.loc[count] = player
        count+=1
newdf


#set X and y
X_cur = newdf.drop(['Player Id','Name','HOF'],axis=1)
y_cur = newdf['HOF']

#fit model
clff = GradientBoostingClassifier(min_samples_leaf=5,min_samples_split=.1,max_features=.2,subsample=.9,max_depth=7)
clf2 = LogisticRegression()

In [469]:
def get_predictions(clf,X,y,X_test,fulldf):
    df = pd.DataFrame(columns=['Name','Prediction'])
    for a in np.arange(0,60):
        clf.fit(X,y)
        #define new df columns
        names = fulldf['Name']
        predictions = clf.predict_proba(X_test)[:,1]
        #make new df
        new_df = pd.DataFrame({'Name': names,
                          'Prediction': predictions})
        df = pd.concat([df,new_df])
    return df

In [484]:
predictions = get_predictions(clff,X,y,X_cur,currentplayers)
predictions = predictions.pivot_table(index='Name').sort_values('Prediction',ascending=False)

In [485]:
predictions

Unnamed: 0_level_0,Prediction
Name,Unnamed: 1_level_1
"Brady, Tom",0.999401
"Elway, John",0.999324
"Manning, Peyton",0.998763
"Montana, Joe",0.998674
"Bradshaw, Terry",0.998567
"Brown, Jim",0.998200
"Payton, Walter",0.998148
"Simpson, O.J.",0.997967
"Smith, Emmitt",0.997960
"Tomlinson, LaDainian",0.997807
