In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.arima_model import ARMA, AR, ARIMA
from statsmodels.tsa.stattools import adfuller

  from pandas.core import datetools


### Importing & Building Raw Data Set

In [2]:
file_lst = os.listdir('./raw_data_nfl_QB/')
for idx,f in enumerate(file_lst):
    if idx == 0:
        data = pd.read_csv('./raw_data_nfl_QB/{}'.format(f))
    else:
        temp_df = pd.read_csv('./raw_data_nfl_QB/{}'.format(f))
        data = pd.concat([data,temp_df],axis = 0)

In [3]:
data.reset_index(inplace=True)

### Renaming

In [4]:
data['Rank_QBR'] = data['Rk']
data['Sack'] = data['Sk']
data['Completions'] = data['Cmp']
data['Attempts'] = data['Att']
data['Passing_Yards'] = data['Yds']
data['Sack_Yards'] = data['Yds.1']
data['QBR'] = data['Rate']
data['Yds_Per_Attempt'] =  data['Y/A']
data['Adj_Yd_Per_Attempt'] = data['AY/A']

### Feature Engineering

In [5]:
data['Year'] = data['Date'].str.split('-', 1, expand=True)[0]
data['Year'] = data['Year'].astype(str).astype(int)
data['Month'] = data['Date'].str.split('-', 2, expand=True)[1]
data['Month'] = data['Month'].astype(str).astype(int)
data['Name'] = data['Unnamed: 1'].str.split('\\', 1, expand=True)[0]
data['Home_Away'] = (data['Unnamed: 6']=='@').map({False:'Home', True:'Away'})
data['Home_Team'] = (data['Home_Away']=='Home').map({True:1, False:0})

In [6]:
QB_ID = []
for a, b in zip(data['Year'], data['Name']):
    Qid = '{}_{}'.format(a,b)
    QB_ID.append(Qid)
    
data['QB_ID'] = QB_ID

### Dropping Irrelevant & Redundant Columns

In [7]:
## Dropping Columns that have been renamed or are irrelevant('Lg')
drop_cols = ['Rk', 'Sk', 'Cmp', 'Yds', 'Lg', 'Att', 'Yds', 'Yds.1',
             'Rate', 'Y/A', 'AY/A', 'index', 'Unnamed: 1', 'Unnamed: 6']
data.drop(drop_cols, axis=1, inplace=True)

### Trimming Dataframe to Passing Attempts >= 5

In [8]:
## Dropping Attempts Below 5
data = data.loc[data['Attempts']>=5]

In [9]:
data.columns

Index(['Age', 'Date', 'Tm', 'Opp', 'Result', 'G#', 'Week', 'Day', 'Cmp%', 'TD',
       'Int', 'Rank_QBR', 'Sack', 'Completions', 'Attempts', 'Passing_Yards',
       'Sack_Yards', 'QBR', 'Yds_Per_Attempt', 'Adj_Yd_Per_Attempt', 'Year',
       'Month', 'Name', 'Home_Away', 'Home_Team', 'QB_ID'],
      dtype='object')

### Check for Stationarity

In [10]:
results_Yds = adfuller(data['Passing_Yards'].values)
results_TD = adfuller(data['TD'].values)
results_Int = adfuller(data['Int'].values)

In [11]:
print('Yards Dickey Fuller Test')
print(results_Yds[0], 'T-Statistic')
print(results_Yds[1], 'P_value')
print(results_Yds[4], 'Critical Values')
print('TD Dickey Fuller Test')
print(results_TD[0], 'T-Statistic')
print(results_TD[1], 'P_value')
print(results_TD[4], 'Critical Values')
print('Int Dickey Fuller Test')
print(results_Int[0], 'T-Statistic')
print(results_Int[1], 'P_value')
print(results_Int[4], 'Critical Values')

Yards Dickey Fuller Test
-13.7318326603 T-Statistic
1.13632329758e-25 P_value
{'1%': -3.4333601308010926, '5%': -2.8628697724738688, '10%': -2.5674779041352886} Critical Values
TD Dickey Fuller Test
-14.8315014124 T-Statistic
1.89618765822e-27 P_value
{'1%': -3.4333684674663467, '5%': -2.8628734534635902, '10%': -2.5674798640234444} Critical Values
Int Dickey Fuller Test
-15.1702199755 T-Statistic
6.40644513972e-28 P_value
{'1%': -3.4333670748168506, '5%': -2.8628728385507474, '10%': -2.567479536622141} Critical Values


In [15]:
### shows variables are stationary

In [12]:
## Break Down By Week
Week1_df = data.loc[data['Week']==1]
Week2_df = data.loc[data['Week']==2]
Week3_df = data.loc[data['Week']==3]
Week4_df = data.loc[data['Week']==4]
Week5_df = data.loc[data['Week']==5]
Week6_df = data.loc[data['Week']==6]
Week7_df = data.loc[data['Week']==7]
Week8_df = data.loc[data['Week']==8]
Week9_df = data.loc[data['Week']==9]
Week10_df = data.loc[data['Week']==10]
Week11_df = data.loc[data['Week']==11]
Week12_df = data.loc[data['Week']==12]
Week13_df = data.loc[data['Week']==13]
Week14_df = data.loc[data['Week']==14]
Week15_df = data.loc[data['Week']==15]
Week16_df = data.loc[data['Week']==16]
Week17_df = data.loc[data['Week']==17]

In [13]:
## Break Down By Season
df_2016 = data.loc[data['Year']==2016]
df_2015 = data.loc[data['Year']==2015]
df_2014 = data.loc[data['Year']==2014]
df_2013 = data.loc[data['Year']==2013]

convert date to date time object for time series set as index 

###drop low completion numbers

###aggregate by weeks over years

###label qbs by year

####check for stationary if not turn into stationary

    stats models package
    
    check lecture

In [14]:
X = data[['Cmp%', 'TD', 'Int', 'Sack', 'Completions', 'Attempts','Sack_Yards', 'QBR']]
y = data['Passing_Yards']
lr_yards = LinearRegression()
lr_yards.fit(X,y)

predict_yards = lr_yards.predict(X)

In [15]:
lr_yards.coef_

array([ -4.11355449, -15.30438064,  34.30612255,  -0.71574486,
        10.57206503,  -0.26594138,   0.13497222,   2.8726782 ])

In [16]:
residuals = predict_yards - y.values

ar_yards = ARMA(residuals, (2,1)).fit()
params = ar_yards.params

In [183]:
df = df_2014.loc[df_2014['Week']==1]

In [188]:
wk = [78.26,20,3,4,18,23,31,127.9]

In [226]:
Xs = df[['Cmp%', 'TD', 'Int', 'Sack', 'Completions', 'Attempts','Sack_Yards', 'QBR']]
y = df['Passing_Yards']

In [316]:
y = ['Passing_Yards']
X = ['Cmp%', 'TD', 'Int', 'Sack', 'Completions', 'Attempts','Sack_Yards', 'QBR']
peyton_13 = df_2013[df_2013['QB_ID'] == '2013_Peyton Manning']
X_peyton_13 = peyton_13[X]
y_peyton_13 = peyton_13[y]

list of every unique qb id


In [289]:
peyton_13 = peyton_13.sort_values('G#')

peyton_13.reset_index(inplace=True)

In [313]:
week = 3
w = week-1
train = peyton_13[:w][X]
train

Unnamed: 0,Cmp%,TD,Int,Sack,Completions,Attempts,Sack_Yards,QBR
0,64.29,7,0,3,27,42,17,141.1
1,69.77,2,0,0,30,43,0,105.5


In [348]:
y_pred_peyton =  lr_yards.predict(train)

In [349]:
y_pred_peyton

array([ 332.25375778,  315.26899881])

In [355]:
np.array(y_peyton_13['Passing_Yards'][:w] - y_pred_peyton)

array([ 129.74624222,   14.73100119])

In [339]:
y_peyton_13[:w].values.reshape(1,2).flatten()

array([462, 330], dtype=int64)

In [343]:
resid = y_pred_peyton - y_peyton_13[:w].values

In [None]:
ar_yards = ARMA(residuals, (2,1), freq=).fit()

In [492]:
def yard_projections(mstr_df,qb_id,week,lr_yards,ar_yards):
    '''
    mstr_df: Pandas DF; master df with all qbs in all years and all weeks
    qb_id: string; the id of the qq you want to make a prediction
    week: int, predicting week
    lr_yards: linear regression model for yards
    ar_yards: ArMA model object for yard residuals
    
    '''
    ycol = ['Passing_Yards'] ## Target Variable
    Xcols = ['Cmp%', 'TD', 'Int', 'Sack', 'Completions', 'Attempts','Sack_Yards', 'QBR'] ## predictors
    ## create temp df for each QB by Year and week
    ## then reset index
    temp_df = mstr_df[mstr_df['QB_ID'] == qb_id]
    temp_df = temp_df.sort_values('G#')
    temp_df.reset_index(inplace=True)
    X = temp_df[Xcols]
    y = temp_df[ycol]
    ## grabbing the weeks before the week specified
    w = week-1
    prev_weeks = temp_df[:w][Xcols]
    y_pred = lr_yards.predict(prev_weeks)
    resid = np.array(y[ycol][:w]).flatten()-y_pred
    print(type(resid))
    dates
    ar_yards.predict(resid, start=temp_df.index[w], end=temp_df.index[w+1])

## Predict week 4 for Peyton

In [494]:
yard_projections(data, '2014_Peyton Manning', 4, lr_yards, ar_yards)

<class 'numpy.ndarray'>


TypeError: predict() got multiple values for argument 'start'

In [242]:
p_13_hat = lr_yards.predict(peyton_13)

In [243]:
p_13_hat

array([ 332.25375778,  321.20055572,  275.37970925,  205.20029177,
        320.42678748,  397.25766565,  297.78777995,  354.3627831 ,
        271.76038609,  315.26899881,  340.52211455,  330.59862009,
        383.98076174,  305.25320622,  338.63383161,  362.27231315])

In [252]:
peyton_13.head()

Unnamed: 0,Cmp%,TD,Int,Sack,Completions,Attempts,Sack_Yards,QBR
0,64.29,7,0,3,27,42,17,141.1
37,69.44,4,0,2,25,36,17,135.2
78,60.0,1,0,0,24,40,0,94.1
127,52.78,2,1,2,19,36,18,70.4
141,62.86,5,2,0,22,35,0,118.2


In [None]:
ar_yards()