In [71]:
import pandas as pd
import numpy as np
import os
from statsmodels.tsa.arima_model import _arma_predict_out_of_sample
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.arima_model import ARMA
from statsmodels.tsa.stattools import adfuller

### Importing & Building Raw Data Set

In [20]:
file_lst = os.listdir('./raw_data_nfl_QB/')
for idx,f in enumerate(file_lst):
    if idx == 0:
        data = pd.read_csv('./raw_data_nfl_QB/{}'.format(f))
    else:
        temp_df = pd.read_csv('./raw_data_nfl_QB/{}'.format(f))
        data = pd.concat([data,temp_df],axis = 0)

In [21]:
data.reset_index(inplace=True)

### Renaming

In [22]:
data['Rank_QBR'] = data['Rk']
data['Sack'] = data['Sk']
data['Completions'] = data['Cmp']
data['Attempts'] = data['Att']
data['Passing_Yards'] = data['Yds']
data['Sack_Yards'] = data['Yds.1']
data['QBR'] = data['Rate']
data['Yds_Per_Attempt'] =  data['Y/A']
data['Adj_Yd_Per_Attempt'] = data['AY/A']

### Feature Engineering

Assuming basic scoring for calculation of fantasy points column 

    25 yards = 1 pt
    1 TD = 6 pts
    1 Int = -2 pts

In [23]:
data['Year'] = data['Date'].str.split('-', 1, expand=True)[0]
data['Year'] = data['Year'].astype(str).astype(int)
data['Month'] = data['Date'].str.split('-', 2, expand=True)[1]
data['Month'] = data['Month'].astype(str).astype(int)
data['Name'] = data['Unnamed: 1'].str.split('\\', 1, expand=True)[0]
data['Home_Away'] = (data['Unnamed: 6']=='@').map({False:'Home', True:'Away'})
data['Home_Team'] = (data['Home_Away']=='Home').map({True:1, False:0})
data['Fantasy_Points'] = (data['Passing_Yards']/25)+(data['TD']*6)-(data['Int']*2)

In [24]:
QB_ID = []
for a, b in zip(data['Year'], data['Name']):
    Qid = '{}_{}'.format(a,b)
    QB_ID.append(Qid)
    
data['QB_ID'] = QB_ID

### Dropping Irrelevant & Redundant Columns

In [25]:
## Dropping Columns that have been renamed or are irrelevant('Lg')
drop_cols = ['Rk', 'Sk', 'Cmp', 'Yds', 'Lg', 'Att', 'Yds', 'Yds.1',
             'Rate', 'Y/A', 'AY/A', 'index', 'Unnamed: 1', 'Unnamed: 6']
data.drop(drop_cols, axis=1, inplace=True)

### Trimming Dataframe to Passing Attempts >= 10

In [30]:
## Dropping Attempts Below 5
data = data.loc[data['Attempts']>=10]

In [31]:
data.columns

Index(['Age', 'Date', 'Tm', 'Opp', 'Result', 'G#', 'Week', 'Day', 'Cmp%', 'TD',
       'Int', 'Rank_QBR', 'Sack', 'Completions', 'Attempts', 'Passing_Yards',
       'Sack_Yards', 'QBR', 'Yds_Per_Attempt', 'Adj_Yd_Per_Attempt', 'Year',
       'Month', 'Name', 'Home_Away', 'Home_Team', 'Fantasy_Points', 'QB_ID'],
      dtype='object')

### Check for Stationarity

In [32]:
results_Yds = adfuller(data['Passing_Yards'].values)
results_TD = adfuller(data['TD'].values)
results_Int = adfuller(data['Int'].values)

In [33]:
print('Yards Dickey Fuller Test')
print(results_Yds[0], 'T-Statistic')
print(results_Yds[1], 'P_value')
print(results_Yds[4], 'Critical Values')
print('TD Dickey Fuller Test')
print(results_TD[0], 'T-Statistic')
print(results_TD[1], 'P_value')
print(results_TD[4], 'Critical Values')
print('Int Dickey Fuller Test')
print(results_Int[0], 'T-Statistic')
print(results_Int[1], 'P_value')
print(results_Int[4], 'Critical Values')

Yards Dickey Fuller Test
-9.9578607912 T-Statistic
2.41723576211e-17 P_value
{'1%': -3.4329527780962255, '5%': -2.8626898965523724, '10%': -2.567382133955709} Critical Values
TD Dickey Fuller Test
-11.2758637549 T-Statistic
1.49719912924e-20 P_value
{'1%': -3.4329527780962255, '5%': -2.8626898965523724, '10%': -2.567382133955709} Critical Values
Int Dickey Fuller Test
-11.7313487594 T-Statistic
1.33935739303e-21 P_value
{'1%': -3.4329527780962255, '5%': -2.8626898965523724, '10%': -2.567382133955709} Critical Values


##### shows variables are stationary

### Setting Functions & Models For Fantasy Relevant Statistics

#### Yards

In [72]:
X = data[['Cmp%', 'TD', 'Int', 'Sack', 'Completions', 'Attempts','Sack_Yards', 'QBR']]
y = data['Passing_Yards']
lr_yards = LinearRegression()
lr_yards.fit(X,y)
predict_yards = lr_yards.predict(X)
residuals = predict_yards - y.values
ar_yards = ARMA(residuals, (2,1)).fit()

In [73]:
def yard_projections(mstr_df,qb_id,week,lr_yards,ar_yards):
    '''
    mstr_df: Pandas DF; master df with all qbs in all years and all weeks
    qb_id: string; the id of the QB you want to make a prediction
    week: int, predicting week
    lr_yards: linear regression model for yards
    ar_yards: ARMA model object for yards residuals
    
    '''
    ycol = ['Passing_Yards'] ## Target Variable
    Xcols = ['Cmp%', 'TD', 'Int', 'Sack', 'Completions', 'Attempts','Sack_Yards', 'QBR'] ## predictors
    ## create temp df for each QB by Year and week
    ## then reset index
    temp_df = mstr_df[mstr_df['QB_ID'] == qb_id]
    temp_df = temp_df.sort_values('G#')
    temp_df.reset_index(inplace=True)
    X = temp_df[Xcols]
    y = temp_df[ycol]
    ## grabbing the weeks before the week specified
    w = week-1
    prev_weeks = temp_df[:w][Xcols]
    y_pred = lr_yards.predict(prev_weeks)
    resid = np.array(y[ycol][:w]).flatten()-y_pred
    dates = temp_df.index
    params = ar_yards.params
    residuals = ar_yards.resid
    p = ar_yards.k_ar
    q = ar_yards.k_ma
    k_exog = ar_yards.k_exog
    k_trend = ar_yards.k_trend
    steps = 1

    
    oos_predictions_yards_resid = _arma_predict_out_of_sample(params, steps, residuals, 
                                    p, q, k_trend, k_exog, 
                                    endog=resid, exog=None, start=w+1)
    
    pred_yds = oos_predictions_yards_resid + (y_pred.sum()/week)
    return pred_yds

#### Touchdowns

In [74]:
X = data[['Cmp%', 'Passing_Yards', 'Int', 'Sack', 'Completions', 'Attempts','Sack_Yards', 'QBR']]
y = data['TD']
lr_TD = LinearRegression()
lr_TD.fit(X,y)
predict_TD = lr_TD.predict(X)
residuals = predict_TD - y.values
ar_TD = ARMA(residuals, (2,1)).fit()

In [75]:
def TD_projections(mstr_df,qb_id,week,lr_TD,ar_TD):
    '''
    mstr_df: Pandas DF; master df with all qbs in all years and all weeks
    qb_id: string; the id of the qq you want to make a prediction
    week: int, predicting week
    lr_TD: linear regression model for touchdowns
    ar_TD: ARMA model object for touchdowns residuals
    
    '''
    ycol = ['TD'] ## Target Variable
    Xcols = ['Cmp%', 'Passing_Yards', 'Int', 'Sack', 'Completions', 'Attempts','Sack_Yards', 'QBR'] ## predictors
    ## create temp df for each QB by Year and week
    ## then reset index
    temp_df = mstr_df[mstr_df['QB_ID'] == qb_id]
    temp_df = temp_df.sort_values('G#')
    temp_df.reset_index(inplace=True)
    X = temp_df[Xcols]
    y = temp_df[ycol]
    ## grabbing the weeks before the week specified
    w = week-1
    prev_weeks = temp_df[:w][Xcols]
    y_pred = lr_TD.predict(prev_weeks)
    resid = np.array(y[ycol][:w]).flatten()-y_pred
    dates = temp_df.index
    params = ar_TD.params
    residuals = ar_TD.resid
    p = ar_TD.k_ar
    q = ar_TD.k_ma
    k_exog = ar_TD.k_exog
    k_trend = ar_TD.k_trend
    steps = 1
    
    oos_predictions_TD_resid = _arma_predict_out_of_sample(params, steps, residuals, 
                                    p, q, k_trend, k_exog, 
                                    endog=resid, exog=None, start=w+1)
    
    pred_TD = oos_predictions_TD_resid + (y_pred.sum()/week)
    return pred_TD

#### Interceptions

In [76]:
X = data[['Cmp%', 'Passing_Yards', 'TD', 'Sack', 'Completions', 'Attempts','Sack_Yards', 'QBR']]
y = data['Int']
lr_Int = LinearRegression()
lr_Int.fit(X,y)
predict_Int = lr_Int.predict(X)
residuals = predict_Int - y.values
ar_Int = ARMA(residuals, (2,1)).fit()

In [77]:
def Int_projections(mstr_df,qb_id,week,lr_Int,ar_Int):
    '''
    mstr_df: Pandas DF; master df with all qbs in all years and all weeks
    qb_id: string; the id of the qq you want to make a prediction
    week: int, predicting week
    lr_Int: linear regression model for interceptions
    ar_Int: ARMA model object for interceptions residuals
    
    '''
    ycol = ['Int'] ## Target Variable
    Xcols = ['Cmp%', 'Passing_Yards', 'TD', 'Sack', 'Completions', 'Attempts','Sack_Yards', 'QBR'] ## predictors
    ## create temp df for each QB by Year and week
    ## then reset index
    temp_df = mstr_df[mstr_df['QB_ID'] == qb_id]
    temp_df = temp_df.sort_values('G#')
    temp_df.reset_index(inplace=True)
    X = temp_df[Xcols]
    y = temp_df[ycol]
    ## grabbing the weeks before the week specified
    w = week-1
    prev_weeks = temp_df[:w][Xcols]
    y_pred = lr_Int.predict(prev_weeks)
    resid = np.array(y[ycol][:w]).flatten()-y_pred
    dates = temp_df.index
    params = ar_Int.params
    residuals = ar_Int.resid
    p = ar_Int.k_ar
    q = ar_Int.k_ma
    k_exog = ar_Int.k_exog
    k_trend = ar_Int.k_trend
    steps = 1

    oos_predictions_Int_resid = _arma_predict_out_of_sample(params, steps, residuals, 
                                    p, q, k_trend, k_exog, 
                                    endog=resid, exog=None, start=w+1)

    pred_Int = oos_predictions_Int_resid + (y_pred.sum()/week)
    return pred_Int

### Calculating Fantasy Point Projections

#### Function 

In [78]:
def Fantasy_Points_Calculator(QB_ID, week):
    Fantasy_point_prediction = (((yard_projections(data, QB_ID, week, lr_yards, ar_yards))/25)
    +((TD_projections(data, QB_ID, week, lr_TD, ar_TD))*6)
    -((Int_projections(data, QB_ID, week, lr_Int, ar_Int))*2))
    return Fantasy_point_prediction

In [79]:
## Break Down By Week
Week1_df = data.loc[data['Week']==1]
Week2_df = data.loc[data['Week']==2]
Week3_df = data.loc[data['Week']==3]
Week4_df = data.loc[data['Week']==4]
Week5_df = data.loc[data['Week']==5]
Week6_df = data.loc[data['Week']==6]
Week7_df = data.loc[data['Week']==7]
Week8_df = data.loc[data['Week']==8]
Week9_df = data.loc[data['Week']==9]
Week10_df = data.loc[data['Week']==10]
Week11_df = data.loc[data['Week']==11]
Week12_df = data.loc[data['Week']==12]
Week13_df = data.loc[data['Week']==13]
Week14_df = data.loc[data['Week']==14]
Week15_df = data.loc[data['Week']==15]
Week16_df = data.loc[data['Week']==16]
Week17_df = data.loc[data['Week']==17]

## Break Down By Season

df_2016 = data.loc[data['Year']==2016]
df_2015 = data.loc[data['Year']==2015]
df_2014 = data.loc[data['Year']==2014]
df_2013 = data.loc[data['Year']==2013]

In [80]:
qbs_2017_14 = Week14_df.loc[Week14_df['QB_ID'].str.split('_', 1, expand=True)[0]=='2017']
qbs_2017_13 = Week13_df.loc[Week13_df['QB_ID'].str.split('_', 1, expand=True)[0]=='2017']
qbs_2017_12 = Week12_df.loc[Week12_df['QB_ID'].str.split('_', 1, expand=True)[0]=='2017']
qbs_2017_11 = Week11_df.loc[Week11_df['QB_ID'].str.split('_', 1, expand=True)[0]=='2017']

In [81]:
qbs14 = np.array(qbs_2017_14['QB_ID'])
qbs13 = np.array(qbs_2017_13['QB_ID'])
qbs12 = np.array(qbs_2017_12['QB_ID'])
qbs11 = np.array(qbs_2017_11['QB_ID'])

In [82]:
qbs_1merge = list(set(qbs14).intersection(qbs13))
qbs_2merge = list(set(qbs_1merge).intersection(qbs12))
qbs = list(set(qbs_2merge).intersection(qbs11))

In [83]:
qbs_working= ['2017_Tom Brady', '2017_Blake Bortles', 
              '2017_Alex Smith', '2017_Russell Wilson', '2017_Drew Brees', 
              '2017_Joe Flacco', '2017_Philip Rivers', 
              '2017_Matthew Stafford', '2017_Kirk Cousins', '2017_Jared Goff']

In [84]:
week_15_projections = []
for i in qbs_working:
    projections = Fantasy_Points_Calculator(i, 15)[0]
    week_15_projections.append(projections)

In [85]:
proj_df = pd.DataFrame(week_15_projections)
proj_df['Projections'] = proj_df[0]

In [86]:
proj_df.drop([0], axis=1, inplace=True)

In [87]:
proj_df['QB'] = qbs_working
proj_df

Unnamed: 0,Projections,QB
0,20.953711,2017_Tom Brady
1,10.881225,2017_Blake Bortles
2,18.072424,2017_Alex Smith
3,15.081776,2017_Russell Wilson
4,16.215238,2017_Drew Brees
5,8.745236,2017_Joe Flacco
6,18.134847,2017_Philip Rivers
7,16.840742,2017_Matthew Stafford
8,15.741082,2017_Kirk Cousins
9,16.840403,2017_Jared Goff


In [88]:
actual_fantasy_pts = [33.92, 31.04, 19.24, 15.68, 19.40, 21.52, 9.08, 21.48, 19.84, 14.80]


proj_df['actual_pts'] = actual_fantasy_pts
proj_df['se'] = (proj_df['Projections'] - proj_df['actual_pts'])**2
rmse = np.sqrt(proj_df['se'].mean())
rmse

9.3489926493359796