# Who should you take in the NFL draft? - QB Edition

## Quarterback

We can start with the quarter back position:

In [5]:
import pandas as pd
import numpy as np

from sklearn.cross_validation import cross_val_score
from sklearn import linear_model, ensemble, decomposition
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
sns.set()

from imblearn.over_sampling import SMOTE

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

%config InlineBackend.figure_format='retina'
matplotlib.rcParams['figure.figsize'] = (12.0, 8.0)

In [None]:
df = pd.read_csv('/Users/richard/data/NFL2.csv').drop(['Unnamed: 0', 'Rk', 'School'],1)

In [15]:
df.head()

Unnamed: 0,Year,Player,Pos,AV,College,Height,Wt,40YD,Vertical,BenchReps,Broad Jump,3Cone,Shuttle,Drafted (tm/rnd/yr),Probowl
0,2000,Sebastian Janikowski,K,53,College Stats,6-1,260,,,,,,,Oakland Raiders / 1st / 17th pick / 2000,1.0
1,2000,Jake Arians,K,0,,5-10,202,,,,,,,,0.0
2,2000,Doug Chapman,RB,3,College Stats,5-10,215,4.56,38.5,16.0,128.0,6.84,4.2,Minnesota Vikings / 3rd / 88th pick / 2000,0.0
3,2000,Kwame Cavil,WR,0,College Stats,6-2,208,4.54,39.5,,118.0,,,,0.0
4,2000,Trung Canidate,RB,11,College Stats,5-11,193,4.41,,18.0,,,,St. Louis Rams / 1st / 31st pick / 2000,0.0


In [16]:
df_qb = df[df['Pos'] == 'QB']
df_qb = df_qb.drop(df_qb[df_qb['Player']=='Player'].index)
df_qb['Height_inches'] = 12*df_qb['Height'].str.extract('([0-9]+)-([0-9]*\.?[0-9]+)')[0].astype(int) + df_qb['Height'].str.extract('([0-9]+)-([0-9]*\.?[0-9]+)')[1].astype(int)
df_qb = df_qb.drop(['Height'], 1)

df_qb = df_qb.drop(['BenchReps'], 1).dropna() # most QBs don't do benchreps!
df_qb[['Year','Wt', '40YD', 'Vertical', 'Broad Jump', '3Cone', 'Shuttle', 'Height_inches']] = df_qb[['Year','Wt', '40YD', 'Vertical', 'Broad Jump', '3Cone', 'Shuttle', 'Height_inches']].astype(np.number)

feature_cols = ['Wt', '40YD', 'Vertical', 'Broad Jump', '3Cone', 'Shuttle', 'Height_inches']

In [29]:
df = df.drop(df[df['Player']=='Player'].index)
df['Height_inches'] = 12*df['Height'].str.extract('([0-9]+)-([0-9]*\.?[0-9]+)')[0].astype(int) + df['Height'].str.extract('([0-9]+)-([0-9]*\.?[0-9]+)')[1].astype(int)
df = df.drop(['Height'], 1)

df = df.drop(['BenchReps'], 1).dropna() # most QBs don't do benchreps!
df[['Year','Wt', '40YD', 'Vertical', 'Broad Jump', '3Cone', 'Shuttle', 'Height_inches']] = df[['Year','Wt', '40YD', 'Vertical', 'Broad Jump', '3Cone', 'Shuttle', 'Height_inches']].astype(np.number)

feature_cols = ['Wt', '40YD', 'Vertical', 'Broad Jump', '3Cone', 'Shuttle', 'Height_inches']

It turns out that quarterbacks hardly ever do benchreps at the Combine! And also how many total players we have data on that made it to the probowl:

In [30]:
df.groupby('Probowl').size()

Probowl
0.0    1249
1.0     164
dtype: int64

13 players out of our 90 made it to the probowl. Holy smokes batman I don't think we have enough data!

In [None]:
cutoff_year = 2010

df_train   = df[df['Year'] < cutoff_year]
df_test = df[df['Year'] >= cutoff_year]

X_train = df_train[feature_cols]
y_train = df_train.Probowl

X_test = df_test[feature_cols]
y_test = df_test.Probowl

print(len(y_train), len(y_test))

In [31]:
cutoff_year = 2010

df_train   = df[df['Year'] < cutoff_year]
df_test = df[df['Year'] >= cutoff_year]

X_train = df_train[feature_cols]
y_train = df_train.Probowl

X_test = df_test[feature_cols]
y_test = df_test.Probowl

print(len(y_train), len(y_test))

789 624


In [32]:
1 - df_train.groupby('Probowl').size()[1]/df_train.groupby('Probowl').size()[0]

0.8581765557163531

In [33]:
lr = linear_model.LogisticRegression()
rf = ensemble.RandomForestClassifier(n_jobs=8)
scores = cross_val_score(rf, X_train, y_train, cv = 10, scoring='accuracy')
print(np.round(scores,2))

[ 0.86  0.87  0.87  0.85  0.86  0.86  0.85  0.89  0.87  0.87]


In [34]:
rf.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=8,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [36]:
arr = np.zeros((len(y_test),4))

arr[:,0] = np.array(np.round(rf.predict_proba(X_test)[:,0],2))
arr[:,1] = np.array(np.round(rf.predict_proba(X_test)[:,1],2))
arr[:,2] = np.array(rf.predict(X_test))
arr[:,3] = np.array(y_test)

results = pd.DataFrame(arr, columns=['non probowl prob', 'probowl prob', 'prediction', 'actual'])

results.sort('predicted',ascending=False)

Unnamed: 0,non probowl prob,probowl prob,prediction,actual
623,1.0,0.0,0.0,1.0
488,0.8,0.2,0.0,1.0
235,0.9,0.1,0.0,1.0
237,1.0,0.0,0.0,1.0
49,0.9,0.1,0.0,1.0
95,0.7,0.3,0.0,1.0
209,0.8,0.2,0.0,1.0
369,0.9,0.1,0.0,1.0
240,0.9,0.1,0.0,1.0
552,0.6,0.4,0.0,1.0
