# Check the coverage (calibration) of the 95th quantile predictions
## If well calibrated we should expect about 95% of the true AV values to be less than or equal to the predictions

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import LeaveOneGroupOut

from sklearn.metrics import make_scorer
from sklearn.externals import joblib


In [2]:
train_df = pd.read_csv("processed_data/rb_train_data.csv")

In [3]:
model = joblib.load("models/best_model_04.01.pkl")

In [4]:
# create yearly average rushing stats
rush_per_yr_cols = ["Rush_Att_per_Yr", "Rush_Yds_per_Yr", "Rush_TD_per_Yr"]
train_df[rush_per_yr_cols] = (train_df.loc[:, "Rush_Att":"Rush_TD"]
                                      .div(train_df.Col_Yrs_Played, axis=0)
                                      .fillna(0))

features = ["Rush_Att",
            "Rush_Yds",
            "Rush_TD",
            "Rush_Yds_per_Att",
            "Rush_Att_per_G",
            "Rush_Yds_per_G",
            "Rush_TD_per_G",
            "Rush_Att_per_Yr",
            "Rush_Yds_per_Yr",
            "Rush_TD_per_Yr",
            "Ht",
            "Wt",
            "Forty",
            "Vertical",
            "Bench",
            "Broad Jump",
            "Cone",
            "Shuttle"]

target = "AV"

X = train_df.loc[:, features]
y = train_df.loc[:, target]

# # get a dictionary with the season as the key and the associated
# # row indices as the values
year_grpby = train_df.groupby("Year")
year_grpby.grouper.label_info
logo = LeaveOneGroupOut()
cv = list(logo.split(X, y, year_grpby.grouper.label_info))

In [5]:
cv_predictions = cross_val_predict(model, X, y)

In [6]:
cv_predictions

array([19.78155117, 17.80615864, 15.65127059, 34.56283128, 34.04265026,
       19.59786865, 18.26476762, 35.01079753, 26.45876458, 30.83540949,
       34.7491766 , 19.6285507 , 23.98458821, 29.13051701, 20.43136373,
       27.80202519, 26.49138404, 44.43180449, 20.49128828, 15.28588288,
       20.24172751, 26.16544337, 28.00709943, 37.37875842, 22.48359939,
       30.32641756, 20.5240252 , 19.65857356, 16.6266592 , 20.60072317,
       16.7430104 , 21.75615452, 18.79590954, 15.25029668, 15.79251332,
       32.6176077 , 46.59988567, 37.01514664, 21.1668025 , 21.67748509,
       35.1150577 , 16.43710161, 22.5508128 , 15.79251332, 21.11618708,
       34.24778595, 25.88655261, 43.64564373, 29.38790254, 23.53176116,
       13.58927584, 33.32059211, 46.64399082, 16.74034942, 24.03007959,
       22.81752287, 33.07297703, 30.86126002, 24.66926801, 16.15683494,
       17.29416632, 20.43300986, 22.26327112, 31.29252052,  9.84037609,
       14.7041954 , 33.32668638, 22.12829217, 22.17126681, 19.36

In [7]:
train_df['AV_95_pred'] = cv_predictions

In [8]:
train_df[['Player', 'AV_95_pred']].sort_values('AV_95_pred', ascending=False).head(25)

Unnamed: 0,Player,AV_95_pred
286,Ryan Mathews,65.786186
232,Darren McFadden,61.435311
271,Beanie Wells,58.581268
206,Adrian Peterson,57.916082
238,Ray Rice,56.91937
241,Steve Slaton,55.263904
328,Jordan Todman,55.23486
122,Steven Jackson,53.408167
188,DeAngelo Williams,52.161393
295,Ben Tate,50.534739


In [9]:
(train_df.AV <= train_df.AV_95_pred).sum() / train_df.shape[0]

0.9096385542168675