In [1]:
import pandas as pd
import numpy as np

# Notebook display settings: never truncate columns, show at most 100 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100


In [4]:
# Raw inputs, read relative to the notebook's working directory.
# `production` holds one row per player-season (it is grouped by PlayerId and
# ordered by Season below); `measurables` is re-indexed by PlayerId later on.
production = pd.read_csv("data/Production.csv")
measurables = pd.read_csv("data/Measurables.csv")

In [5]:
# Create Career NFL Production Ranking
#
# Per-season rate features, computed on a new frame so `production` stays untouched.
# The divisor 16 is presumably the number of regular-season games for the single
# season each row covers -- TODO confirm for seasons of a different length.
# The three play rates share the `Plays` denominator, so a season with zero plays
# produces NaN/inf rates rather than raising.
nfl_production = production.assign(**{
    'GP%': lambda season: season['GamesPlayed'] / 16,
    'GS%': lambda season: season['GamesStarted'] / 16,
    'PosPlay%': lambda season: season['PositivePlays'] / season['Plays'],
    'NegPlay%': lambda season: season['NegativePlays'] / season['Plays'],
    'NeutPlay%': lambda season: (
        season['Plays'] - season['PositivePlays'] - season['NegativePlays']
    ) / season['Plays'],
})

In [6]:
# Scoring weights.
#   alpha_*: mix of games-played vs. games-started rates (availability term)
#   beta_* : reward positive plays, lightly reward neutral plays, penalise negative plays
#   w*     : balance between the availability term and the play-quality term
alpha_1, alpha_2 = 0.7, 0.3
beta_1, beta_2, beta_3 = 2, 0.3, -2
w1, w2 = 0.2, 0.8

# Season_Score = w1 * availability + w2 * play quality, evaluated per season row.
nfl_production['Season_Score'] = (
    w1 * (alpha_1 * nfl_production['GP%'] + alpha_2 * nfl_production['GS%'])
    + w2 * (
        beta_1 * nfl_production['PosPlay%']
        + beta_2 * nfl_production['NeutPlay%']
        + beta_3 * nfl_production['NegPlay%']
    )
)

In [37]:
# Save career rankings: each player's Season_Score summed over all seasons,
# highest total first, written with PlayerId as the index column.
# Season_Score is selected *before* aggregating so that no other column is
# summed needlessly (and a non-numeric column can never break the sum).
(
    nfl_production
    .groupby("PlayerId")[['Season_Score']]
    .sum()
    .sort_values("Season_Score", ascending=False)
    .to_csv("model_data/nfl_desirability.csv")
)

In [39]:
# Calculate next-season correlation inputs.
# Rows are ordered by player then season, and each row is paired with the score
# on the same player's *next row*. Rows with any missing value are then dropped,
# which removes each player's final season (it has no successor).
# NOTE(review): "next row" is not necessarily the next calendar season if a
# player skipped a year -- confirm that is acceptable.
df_sorted = (
    nfl_production
    .sort_values(by=['PlayerId', 'Season'])
    .assign(Next_Season_Score=lambda seasons: seasons.groupby("PlayerId")['Season_Score'].shift(-1))
    .dropna(axis=0)
)

# (n, 2) array: column 0 = Season_Score, column 1 = Next_Season_Score.
pairs = df_sorted[['Season_Score', 'Next_Season_Score']].to_numpy()

In [44]:
# Keep only the last four (season, next-season) rows per player, then extract
# the same two-column array as `pairs`.
# NOTE(review): this keeps the *last* four rows per player, whereas the prospect
# target further down uses the *first* four seasons (`head(4)`) -- confirm intent.
df_sorted_clipped = df_sorted.groupby("PlayerId").apply(lambda player_rows: player_rows.tail(4))
clipped_pairs = (
    df_sorted_clipped
    .dropna()
    [['Season_Score', 'Next_Season_Score']]
    .to_numpy()
)

In [43]:
# Row/column counts before and after clipping, one per line.
print(df_sorted.shape, df_sorted_clipped.shape, sep="\n")

(4041, 14)
(3494, 14)


In [45]:
# Correlation between a season's score and the next season's score (clipped sample).
np.corrcoef(*clipped_pairs.T)

array([[1.        , 0.26273957],
       [0.26273957, 1.        ]])

In [40]:
# Correlation between a season's score and the next season's score (all pairs).
np.corrcoef(*pairs.T)

array([[1.       , 0.2269327],
       [0.2269327, 1.       ]])

In [2]:
# Section marker: a bare string expression, so running the cell echoes it as output.
''' 
Create NFL Production Score (target for prospect model)
'''

' \nCreate NFL Production Score\n'

In [4]:
# First (up to) four seasons per player, ordered by PlayerId and then Season.
# Built with sort + GroupBy.head instead of GroupBy.apply: apply operating on the
# grouping column is deprecated in recent pandas, and the following cells need
# PlayerId to remain a regular column. The original row index is preserved.
trimmed_production = (
    production
    .sort_values(["PlayerId", "Season"])
    .groupby("PlayerId")
    .head(4)
)

# Empty per-player frame (index = PlayerId) that the career aggregates are attached to.
final_production = pd.DataFrame(index=trimmed_production["PlayerId"].unique())

In [5]:
# Number of (non-null) seasons behind each player's totals -- at most four,
# because `trimmed_production` keeps only the first four seasons per player.
final_production['num_seasons'] = trimmed_production.groupby("PlayerId")['Season'].count()

# Attach per-player sums of every production column (index-on-index inner join).
# The summed Season column is discarded.
final_production = final_production.join(
    trimmed_production.groupby("PlayerId").sum().drop(columns="Season"),
    how='inner',
)

In [6]:
# Career-level rate features over the retained seasons.
# Game rates are normalised by num_seasons * 16 (16 is presumably games per
# season -- TODO confirm); play rates are shares of total Plays, so a player
# with zero plays gets NaN/inf rates rather than an error.
final_production = final_production.assign(**{
    'GP%': lambda career: career['GamesPlayed'] / (career['num_seasons'] * 16),
    'GS%': lambda career: career['GamesStarted'] / (career['num_seasons'] * 16),
    'PosPlay%': lambda career: career['PositivePlays'] / career['Plays'],
    'NegPlay%': lambda career: career['NegativePlays'] / career['Plays'],
    'NeutPlay%': lambda career: (
        career['Plays'] - career['PositivePlays'] - career['NegativePlays']
    ) / career['Plays'],
})

In [7]:
# Scoring weights (same values as used for the per-season Season_Score).
#   alpha_*: mix of games-played vs. games-started rates (availability term)
#   beta_* : reward positive plays, lightly reward neutral plays, penalise negative plays
#   w*     : balance between the availability term and the play-quality term
alpha_1, alpha_2 = 0.7, 0.3
beta_1, beta_2, beta_3 = 2, 0.3, -2
w1, w2 = 0.2, 0.8

# Score = w1 * availability + w2 * play quality, one value per player.
final_production['Score'] = (
    w1 * (alpha_1 * final_production['GP%'] + alpha_2 * final_production['GS%'])
    + w2 * (
        beta_1 * final_production['PosPlay%']
        + beta_2 * final_production['NeutPlay%']
        + beta_3 * final_production['NegPlay%']
    )
)

In [8]:
# Section marker: a bare string expression, so running the cell echoes it as output.
'''
Clean/Engineer measurables/college data
'''

'\nClean/Engineer measurables/college data\n'

In [9]:
# Index measurables by PlayerId so they can be joined to the production frame.
# Guarded so the cell can be re-run: once PlayerId has become the index, a second
# unconditional set_index("PlayerId") would raise KeyError.
if measurables.index.name != "PlayerId":
    measurables = measurables.set_index("PlayerId", drop=True)

# Positions excluded from the modelling set.
positions_to_remove = ['ST', 'RS', 'NT', 'LS', 'PK', 'PT']
# Columns carried forward as model inputs.
cols_to_keep = ['ProPosition', 'Hgt', 'Wgt', 'Age', 'Forty', 'Arm', 'Hand', 'Wing', 'TenYard', 'TwentyYard',
                'ThreeCone', 'VJ', 'BJ', 'TwentyShuttle', 'SixtyShuttle', 'BP', 'Test_Acc%', 'IndyInvite']

# Ratio of Test to TestAttempt, rounded to four decimals.
measurables['Test_Acc%'] = (measurables.Test / measurables.TestAttempt).round(4)

# Drop the excluded positions, then keep only the modelling columns.
filtered_measurables = measurables.query("ProPosition not in @positions_to_remove")[cols_to_keep]


In [10]:
# Left join on the PlayerId index: every scored player is kept, and players
# without a measurables row get NaN in the measurable columns.
dataset = final_production.merge(
    filtered_measurables,
    how='left',
    left_index=True,
    right_index=True,
)

In [11]:
# Keep only players that have a ProPosition (i.e. matched a retained measurables row).
dataset = dataset[dataset['ProPosition'].notna()]

In [12]:
# 70th-percentile Score within each ProPosition; used as the per-position success cut-off.
# NOTE(review): computed before the later `Plays > 50` filter, so low-volume
# players influence the thresholds -- confirm that ordering is intended.
group_quantiles = dataset.groupby("ProPosition")['Score'].quantile(0.7)

In [13]:
def mark_success(group, thresholds=None):
    """Return 1 if a player's Score beats their position's threshold, else 0.

    Intended for row-wise use, e.g. ``dataset.apply(mark_success, axis=1)``;
    despite its name, ``group`` is a single row, not a groupby group.

    Parameters
    ----------
    group : mapping-like (pandas Series row or dict)
        Must provide ``'ProPosition'`` and ``'Score'``.
    thresholds : mapping-like, optional
        Position -> Score threshold. Defaults to the notebook-level
        ``group_quantiles``.

    Returns
    -------
    int
        1 when Score is strictly greater than the threshold, otherwise 0
        (a NaN Score compares False and therefore yields 0).

    Raises
    ------
    KeyError
        If the row's position is missing from ``thresholds``.
    """
    if thresholds is None:
        # Fall back to the thresholds computed earlier in the notebook.
        thresholds = group_quantiles
    percentile_threshold = thresholds[group['ProPosition']]
    # int(...) works for both Python and NumPy booleans, whereas `.astype`
    # only exists when the comparison happens to return a NumPy scalar.
    return int(group['Score'] > percentile_threshold)

In [14]:
# Label every player row: 1 if Score exceeds the position's threshold, else 0.
dataset = dataset.assign(Success=dataset.apply(mark_success, axis=1))

In [15]:
# Keep only players with more than 50 total plays (rows with missing Plays are dropped too).
dataset = dataset[dataset['Plays'] > 50]

In [16]:
#dataset.to_csv("model_data/input_dataset.csv")

This is an artifact of the distribution of positive and negative plays: RBs have the highest percentage of positive plays (`PosPlay%`).

In [17]:
# Hand-picked PlayerIds used to sanity-check the Score against intuition;
# each note records why the player is an interesting case.
list_of_val_players = [
    82000,
    10081,
    63510,  # same as 10018 but fewer total plays
    10018,  # not a lot of games started, but lot of positive plays
    89301,  # close to even pos/neg ratio, but high GP & GS
    28799,  # more games started, but worse positive/negative ratio
    90244,  # low games started, slightly pos pos/neg ratio
    68164,  # not a lot of games, not a lot of starts, more negative plays
    20346,  # lot of games played, high games started, more negative plays
]

In [18]:
# Display the validation players, in the order listed, with all columns.
dataset.loc[list_of_val_players]

Unnamed: 0,num_seasons,GamesPlayed,GamesStarted,Plays,PositivePlays,NegativePlays,GP%,GS%,PosPlay%,NegPlay%,NeutPlay%,Score,ProPosition,Hgt,Wgt,Age,Forty,Arm,Hand,Wing,TenYard,TwentyYard,ThreeCone,VJ,BJ,TwentyShuttle,SixtyShuttle,BP,Test_Acc%,IndyInvite,Success
82000,4,61,52,2927,169,57,0.953125,0.8125,0.057738,0.019474,0.922788,0.46488,WR,77.0,240.0,23.2,4.63,34.875,10.375,84.0,1.67,2.68,7.33,32.5,9.11,4.39,12.08,13.0,0.3333,Y,1
10081,4,52,48,1905,279,117,0.8125,0.75,0.146457,0.061417,0.792126,0.484923,DT,75.88,336.0,21.3,5.28,33.125,10.125,79.625,1.9,3.03,7.62,24.5,,4.82,,19.0,0.9412,Y,1
63510,4,51,3,845,65,23,0.796875,0.046875,0.076923,0.027219,0.895858,0.408908,OLB,75.38,255.0,21.5,4.58,34.5,10.125,81.25,1.58,2.61,7.14,33.0,9.11,4.4,,27.0,0.587,Y,0
10018,3,40,2,1141,94,53,0.833333,0.041667,0.082384,0.04645,0.871166,0.38574,DT,74.88,293.0,23.4,4.79,31.5,9.375,76.375,1.68,2.79,7.23,29.0,9.02,4.37,,29.0,0.5652,Y,0
89301,4,64,64,4237,304,288,1.0,1.0,0.071749,0.067973,0.860278,0.412509,OC,75.63,312.0,22.1,5.56,33.0,10.0,79.0,1.94,3.19,7.81,28.5,8.01,4.76,,21.0,0.68,Y,1
28799,4,63,21,1718,124,122,0.984375,0.328125,0.072177,0.071013,0.85681,0.364997,TE,77.63,255.0,23.0,4.94,32.125,9.5,77.375,1.75,2.89,6.9,31.5,9.02,4.25,11.84,16.0,0.6765,N,1
90244,3,24,3,348,39,33,0.5,0.0625,0.112069,0.094828,0.793103,0.291681,DT,73.25,337.0,23.2,5.45,31.75,9.375,77.5,1.91,3.09,7.89,26.0,7.04,4.6,,32.0,0.6667,Y,0
68164,4,51,20,1976,67,143,0.796875,0.3125,0.033907,0.072368,0.893725,0.283268,DS,71.5,206.0,22.0,4.56,31.75,9.5,76.0,1.6,2.63,6.72,37.5,10.06,4.22,11.94,15.0,0.439,Y,0
20346,4,19,8,440,10,43,0.296875,0.125,0.022727,0.097727,0.879545,0.140153,DC,70.13,185.0,22.4,4.5,29.125,9.0,69.0,1.56,2.6,6.97,34.5,9.1,4.03,11.58,19.0,0.7297,Y,0


In [21]:
# Summary statistics of the final Score over the filtered dataset.
dataset['Score'].describe()

count    1446.000000
mean        0.322457
std         0.133960
min        -0.671250
25%         0.254648
50%         0.321096
75%         0.396471
max         0.955000
Name: Score, dtype: float64