# Code using NFL Combine to predict your position based on measurables like height/weight/forty/etc.
## Also predict what pick# a player would be, based on position and measurables
## I like football, so was interested in exploring data
## Answer questions like: which players were over- or under-drafted relative to their measurables, and which players might have done well at a different position?
### Very simple models - nothing special
### I may try making an API so people can more easily play with what-if scenarios

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

import lightgbm as lgb

In [None]:
# Load the processed combine results CSV into `df`; every later cell works
# from this frame.
df=pd.read_csv('../input/nfl-combine-data/combine_data_since_2000_PROCESSED_2018-04-26.csv')

# A little cleaning
### Make height in feet instead of inches because it speaks to me

In [None]:
# Express height in feet rather than inches, rounded to two decimals.
df['Ht'] = df['Ht'].div(12).round(2)

In [None]:
# Preview the first rows to sanity-check the load and the height conversion.
df.head()

In [None]:
# List the distinct position labels present in the raw data.
df['Pos'].unique()

# Replace some of these weird or general position names

## Seems there were different data sources or something, as ILB and MLB are the same thing
### Also get rid of long snappers, because they are an outlier and probably not ever drafted anyway

In [None]:
# Fold generic or duplicate position labels into one label each.
position_aliases = {
    'G': 'OG',
    'OL': 'OG',
    'S': 'FS',
    'DB': 'CB',
    'EDGE': 'DE',
    'P': 'P/K',
    'K': 'P/K',
    'LB': 'ILB',
    'MLB': 'ILB',
}
df['Pos'] = df['Pos'].replace(position_aliases)

# Long snappers are left out of the analysis entirely.
df = df[df['Pos'] != 'LS']

In [None]:
# Show the available column names before choosing which ones to keep.
df.columns

In [None]:
# Row/column counts after the position cleanup.
df.shape

# Keep only certain columns, make sure they are right data type

In [None]:
# Position label, combine measurables and year only.
# .copy() makes this an independent frame, so the dtype casts below change it
# directly instead of assigning into a slice of `df` (which triggers pandas'
# SettingWithCopyWarning and is not guaranteed to take effect).
combinedata = df[['Pos', 'Ht', 'Wt', 'Forty', 'Vertical', 'BenchReps',
                  'BroadJump', 'Cone', 'Shuttle', 'Year']].copy()

# One astype call with a per-column mapping keeps all casts in one place.
# 'Int32' is pandas' nullable integer dtype, so missing values stay missing
# rather than forcing the column back to float.
combinedata = combinedata.astype({
    'Pos': str,
    'Ht': float,
    'Forty': float,
    'Vertical': float,
    'BroadJump': float,
    'Cone': float,
    'Shuttle': float,
    'Wt': 'Int32',
    'BenchReps': 'Int32',
    'Year': 'Int32',
})

# Split out train/validation data

In [None]:
# Hold out 30% of the rows for validation; random_state pins the shuffle so
# the split is reproducible. The whole frame is passed as X, so both splits
# keep every column (non-feature columns are dropped at fit/predict time),
# and 'Pos' doubles as the label vector.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df['Pos'],
    test_size=0.3,
    random_state=42,
)

# Try out this LGB model, to guess what position a player is based on combine measurables

In [None]:
# Position classifier, created with LightGBM's default settings.
clf = lgb.LGBMClassifier()


In [None]:
# Columns that must not be fed to the position classifier: the label itself
# plus the identifier and draft-related columns.
non_feature_cols = ['Pos', 'Player', 'Pfr_ID', 'AV', 'Team', 'Round', 'Pick']

# The `verbose` keyword is no longer accepted by fit() in LightGBM >= 4.0
# (it raises TypeError). It only controlled evaluation logging, which needs
# an eval_set; none is passed here, so dropping it changes nothing on older
# versions either.
clf.fit(X_train.drop(columns=non_feature_cols), y_train)

In [None]:
# Predict positions for the held-out rows, using the same feature columns the
# classifier was fitted on.
y_pred = clf.predict(
    X_test.drop(columns=['Pos', 'Player', 'Pfr_ID', 'AV', 'Team', 'Round', 'Pick'])
)

# Compute the score once, with the documented (y_true, y_pred) argument order,
# and report that same value instead of recomputing it inside the print.
accuracy = accuracy_score(y_test, y_pred)
print(f'LightGBM Model accuracy score: {accuracy:0.4f}')

# Write back out what is train/val so I can see

In [None]:
# Tag each row with the split it belongs to so it can be inspected later.
X_train['train_test'] = 'train'
X_test['train_test'] = 'test'

# Stack both splits back into one frame, ordered by year.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
alldata = pd.concat([X_train, X_test]).sort_values('Year')

# Now, put in my actual measurables to predict my ideal position
# You can change these measurables to see what position it predicts
## No surprise, for me it is a punter/kicker

In [None]:
# One hypothetical player, keyed by the classifier's feature columns in the
# order they were used for training. NaN marks drills with no result.
my_measurables = {
    'Ht': 5.85,
    'Wt': 175,
    'Forty': 5.1,
    'Vertical': 24,
    'BenchReps': 2,
    'BroadJump': 90,
    'Cone': np.nan,
    'Shuttle': np.nan,
    'Year': 2020,
}

# Single-row float matrix, then a frame with matching column names.
merow = np.array(list(my_measurables.values())).reshape(1, -1)
me = pd.DataFrame(data=merow, columns=list(my_measurables))

# Predicted position for that player.
y_me = clf.predict(me)

y_me

## Now predict what position all the actual draft picks were, knowing only combine data

In [None]:
# Score every player (train and test rows alike) with the position classifier.
# Dropping these columns leaves exactly the features the classifier was
# fitted on.
bookkeeping_cols = ['Pos', 'Player', 'Pfr_ID', 'AV', 'Team', 'Round', 'Pick', 'train_test']
combine_features = alldata.drop(columns=bookkeeping_cols)
alldata['pred'] = clf.predict(combine_features)

In [None]:
# Eyeball actual ('Pos') versus predicted ('pred') position for the first 20 rows.
alldata.head(20)

In [None]:
# Cast Pos to pandas' categorical dtype on both splits so it can be used as a
# categorical feature by the pick model.
for split in (X_train, X_test):
    split['Pos'] = split['Pos'].astype('category')

In [None]:
# Check which columns the training split carries at this point.
X_train.columns

# Now, create a model to predict what pick a player is, just based on position and combine measures
## Only train on drafted players, as it's not going to predict that a player is undrafted

In [None]:
# Keyword arguments for the LightGBM pick-number regressor.
# Left disabled: num_leaves=128, max_bin=512, num_iterations=100000.
hyper_params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric='mape',
    learning_rate=0.1,
    feature_fraction=0.9,
    bagging_fraction=0.7,
    bagging_freq=20,
    verbose=0,
    max_depth=8,
    n_estimators=1000,
)

In [None]:
# Pick-number regressor, configured from the hyper_params dict.
gbm = lgb.LGBMRegressor(**hyper_params)

In [None]:
# Make sure Pos is categorical on both splits right before fitting the pick
# model; casting an already-categorical column leaves it unchanged.
X_train['Pos'], X_test['Pos'] = (
    X_train['Pos'].astype('category'),
    X_test['Pos'].astype('category'),
)

In [None]:
# Columns kept out of the pick model: identifier and draft-related columns,
# the split tag, Year, and the target itself. What remains is Pos plus the
# combine measurables.
pick_drop_cols = ['Player', 'Pfr_ID', 'AV', 'Team', 'Round', 'Year', 'train_test', 'Pick']

# Fit and validate on drafted players only (Pick > 0).
drafted_train = X_train.loc[X_train['Pick'] > 0]
drafted_test = X_test.loc[X_test['Pick'] > 0]

# The `verbose` and `early_stopping_rounds` fit() keywords were removed in
# LightGBM 4.0; the callbacks below are the supported equivalents (stop after
# 10 rounds without improvement on the eval set, log every 10th round).
# NOTE(review): lgb.log_evaluation needs LightGBM >= 3.3; confirm the
# environment's version.
gbm.fit(
    drafted_train.drop(columns=pick_drop_cols),
    drafted_train['Pick'],
    eval_set=[(drafted_test.drop(columns=pick_drop_cols), drafted_test['Pick'])],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
        lgb.log_evaluation(period=10),
    ],
)

# Could obviously use this model to predict what pick# a player would be, based on position and combine results

_________________________

# Predict the pick for all players that were actually drafted
## Let's see which players were best... no surprise that some players are near the top...
## good players like Cam Newton and Mario Williams, but also Combine stars like Logan Thomas and Matt Jones (the WR)

In [None]:
# alldata was assembled before Pos was cast on the splits, so cast it here too
# to match what the pick model was trained on.
alldata['Pos'] = alldata['Pos'].astype('category')

# Predict a pick number for every row from Pos plus the combine measurables.
non_features = ['Player', 'Pick', 'Pfr_ID', 'AV', 'Team', 'Round',
                'Year', 'train_test', 'pred']
alldata['predpick'] = gbm.predict(alldata.drop(columns=non_features))

In [None]:
# The 20 drafted players with the lowest (best) predicted pick number.
drafted = alldata.loc[alldata['Pick'] > 0]
drafted.sort_values('predpick').head(20)

### Group positions for easier viewing

In [None]:
# Members of each coarse position group.
OL = ['C', 'OT', 'OG']
DL = ['NT', 'DT', 'DE']
LB = ['MLB', 'ILB', 'OLB']
DB = ['CB', 'SS', 'FS']

# Flatten the groups into a position -> group-label lookup table.
group_of = {
    pos: label
    for label, members in (('OL', OL), ('DL', DL), ('LB', LB), ('DB', DB))
    for pos in members
}

# Positions outside the four groups keep their own label as their group.
alldata['Pos Group'] = alldata['Pos'].astype(str).map(lambda pos: group_of.get(pos, pos))

## Actual draft position by Pos Group, and also how we fit real vs predicted
### Poor fullbacks and P/K get drafted later, but other positions are surprisingly close, with TE being the next worst-drafted

In [None]:

# Actual versus predicted pick, with a regression fit per position group.
pick_columns = ['Pick', 'predpick']
sns.pairplot(
    alldata,
    x_vars=pick_columns,
    y_vars=pick_columns,
    hue='Pos Group',
    height=8,
    kind='reg',
)

## Relationship between different combine measures, by position group

In [None]:
# Pairwise scatter of four combine measures, coloured by position group.
# Large, half-transparent markers keep overlapping points readable.
measures = ['Forty', 'BenchReps', 'Ht', 'Wt']
sns.pairplot(
    alldata,
    x_vars=measures,
    y_vars=measures,
    hue='Pos Group',
    height=6,
    plot_kws={'alpha': 0.5, 's': 100},
)

## How have forty times (blue) and player weight (orange) evolved over time, on average
### Seems a trend of faster and smaller players drafted peaked in 2008, went up, and is now lower

In [None]:
# Forty time per year on the left axis (default colour).
sns.lineplot(data=alldata, x='Year', y='Forty')

# Second y-axis sharing the same x-axis, used for weight.
ax2 = plt.twinx()

# One tick per year. Series.min()/max() are vectorised and skip missing
# values, unlike the builtin min()/max(), which loop in Python and can return
# NaN when the first value is missing.
first_year, last_year = alldata['Year'].min(), alldata['Year'].max()
plt.xticks(np.arange(first_year, last_year + 1, 1))

# Weight per year on the right axis, in orange to tell the two lines apart.
sns.lineplot(data=alldata, x='Year', y='Wt', ax=ax2, color='orange')


## Let's go into more detail in evolutions by each position

In [None]:
# One panel per position, five panels per row: forty time by year.
g = sns.FacetGrid(alldata, col='Pos', col_wrap=5, height=5)
g.map(sns.lineplot, 'Year', 'Forty', color='blue')
plt.show()

In [None]:
# One panel per position, five panels per row: weight by year.
g = sns.FacetGrid(alldata, col='Pos', col_wrap=5, height=5)
g.map(sns.lineplot, 'Year', 'Wt', color='orange')
plt.show()

## Above, don't notice too many trends, except that TE have definitely been getting faster, and OT lighter

## Let's plot average forty speed and bench reps by round
## It is interesting that as bench goes up, round goes down, which is almost certainly due to more big guys being drafted later

In [None]:
# Forty time per draft round on the left axis (default colour).
sns.lineplot(data=alldata, x='Round', y='Forty')

# Second y-axis sharing the same x-axis, used for bench reps.
ax2 = plt.twinx()

# One tick per round. Series.min()/max() skip missing values; the builtin
# min()/max() return NaN if the first Round value is NaN (e.g. if undrafted
# players have no Round), which makes np.arange fail.
first_round, last_round = alldata['Round'].min(), alldata['Round'].max()
plt.xticks(np.arange(first_round, last_round + 1, 1))

# Bench reps per round on the right axis. Orange (as in the Year plot) keeps
# it distinguishable from the forty-time line, which would otherwise share
# the same default colour.
sns.lineplot(data=alldata, x='Round', y='BenchReps', ax=ax2, color='orange')

## Let's see that in detail.  First, forty time by round.  Clear impact for most positions.

In [None]:
# Mean forty time per position group, one bar per draft round.
g = sns.catplot(
    data=alldata,
    kind='bar',
    x='Pos Group',
    y='Forty',
    hue='Round',
    height=8,
    aspect=2,
)
# Zoom the y-axis into the 4.2-5.5 range so differences between bars show up.
g.ax.set_ylim(4.2, 5.5)

## But logically, bench reps mostly more impactful for line players: OL, DL, TE

In [None]:
# Mean bench reps per position group, one bar per draft round.
# The y-axis is left at seaborn's automatic limits.
g = sns.catplot(
    data=alldata,
    kind='bar',
    x='Pos Group',
    y='BenchReps',
    hue='Round',
    height=8,
    aspect=2,
)

### No surprise, in full position detail NT have the most reps and CB/WR the fewest (surprisingly below P/K, but probably only the P/K who can manage at least 1 rep even attempt it, haha)

In [None]:
# Mean bench reps for each individual position.
# The y-axis is left at seaborn's automatic limits.
g = sns.catplot(
    data=alldata,
    kind='bar',
    x='Pos',
    y='BenchReps',
    height=8,
    aspect=2,
)

In [None]:
# Persist the combined frame (both splits plus the added prediction and
# grouping columns) to the working directory; the index is written as well.
alldata.to_csv("alldata.csv")