In [35]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.arima_model import ARMA
from statsmodels.tsa.stattools import adfuller

### Importing & Building Raw Data Set

In [36]:
file_lst = os.listdir('./raw_data_nfl_QB/')
for idx,f in enumerate(file_lst):
    if idx == 0:
        data = pd.read_csv('./raw_data_nfl_QB/{}'.format(f))
    else:
        temp_df = pd.read_csv('./raw_data_nfl_QB/{}'.format(f))
        data = pd.concat([data,temp_df],axis = 0)

In [37]:
data.reset_index(inplace=True)

### Renaming

In [38]:
data['Rank_QBR'] = data['Rk']
data['Sack'] = data['Sk']
data['Completions'] = data['Cmp']
data['Attempts'] = data['Att']
data['Passing_Yards'] = data['Yds']
data['Sack_Yards'] = data['Yds.1']
data['QBR'] = data['Rate']
data['Yds_Per_Attempt'] =  data['Y/A']
data['Adj_Yd_Per_Attempt'] = data['AY/A']

### Feature Engineering

Assuming basic scoring for calculation of fantasy points column 

    25 yards = 1 pt
    1 TD = 6 pts
    1 Int = -2 pts

In [39]:
data['Year'] = data['Date'].str.split('-', 1, expand=True)[0]
data['Year'] = data['Year'].astype(str).astype(int)
data['Month'] = data['Date'].str.split('-', 2, expand=True)[1]
data['Month'] = data['Month'].astype(str).astype(int)
data['Name'] = data['Unnamed: 1'].str.split('\\', 1, expand=True)[0]
data['Home_Away'] = (data['Unnamed: 6']=='@').map({False:'Home', True:'Away'})
data['Home_Team'] = (data['Home_Away']=='Home').map({True:1, False:0})
data['Fantasy_Points'] = (data['Passing_Yards']/25)+(data['TD']*6)-(data['Int']*2)

In [40]:
QB_ID = []
for a, b in zip(data['Year'], data['Name']):
    Qid = '{}_{}'.format(a,b)
    QB_ID.append(Qid)
    
data['QB_ID'] = QB_ID

### Dropping Irrelevant & Redundant Columns

In [41]:
## Dropping Columns that have been renamed or are irrelevant('Lg')
drop_cols = ['Rk', 'Sk', 'Cmp', 'Yds', 'Lg', 'Att', 'Yds', 'Yds.1',
             'Rate', 'Y/A', 'AY/A', 'index', 'Unnamed: 1', 'Unnamed: 6']
data.drop(drop_cols, axis=1, inplace=True)

### Trimming Dataframe to Passing Attempts >= 5

In [42]:
## Dropping Attempts Below 5
data = data.loc[data['Attempts']>=5]

In [43]:
data.columns

Index(['Age', 'Date', 'Tm', 'Opp', 'Result', 'G#', 'Week', 'Day', 'Cmp%', 'TD',
       'Int', 'Rank_QBR', 'Sack', 'Completions', 'Attempts', 'Passing_Yards',
       'Sack_Yards', 'QBR', 'Yds_Per_Attempt', 'Adj_Yd_Per_Attempt', 'Year',
       'Month', 'Name', 'Home_Away', 'Home_Team', 'Fantasy_Points', 'QB_ID'],
      dtype='object')

### Check for Stationarity

In [44]:
results_Yds = adfuller(data['Passing_Yards'].values)
results_TD = adfuller(data['TD'].values)
results_Int = adfuller(data['Int'].values)

In [45]:
print('Yards Dickey Fuller Test')
print(results_Yds[0], 'T-Statistic')
print(results_Yds[1], 'P_value')
print(results_Yds[4], 'Critical Values')
print('TD Dickey Fuller Test')
print(results_TD[0], 'T-Statistic')
print(results_TD[1], 'P_value')
print(results_TD[4], 'Critical Values')
print('Int Dickey Fuller Test')
print(results_Int[0], 'T-Statistic')
print(results_Int[1], 'P_value')
print(results_Int[4], 'Critical Values')

Yards Dickey Fuller Test
-11.7174958949 T-Statistic
1.43953283153e-21 P_value
{'1%': -3.4328579475395395, '5%': -2.8626480184291183, '10%': -2.5673598376418432} Critical Values
TD Dickey Fuller Test
-11.4002325494 T-Statistic
7.68020883773e-21 P_value
{'1%': -3.4328598727101833, '5%': -2.8626488686169682, '10%': -2.5673602902877883} Critical Values
Int Dickey Fuller Test
-16.849530851 T-Statistic
1.12153527086e-29 P_value
{'1%': -3.4328541060447755, '5%': -2.8626463219585556, '10%': -2.5673589344293486} Critical Values


In [46]:
### shows variables are stationary

### Setting Functions & Models For Fantasy Relevant Statistics

#### Yards

In [47]:
X = data[['Cmp%', 'TD', 'Int', 'Sack', 'Completions', 'Attempts','Sack_Yards', 'QBR']]
y = data['Passing_Yards']
lr_yards = LinearRegression()
lr_yards.fit(X,y)
predict_yards = lr_yards.predict(X)
residuals = predict_yards - y.values
ar_yards = ARMA(residuals, (4,1)).fit()

In [48]:
def yard_projections(mstr_df,qb_id,week,lr_yards,ar_yards):
    '''
    mstr_df: Pandas DF; master df with all qbs in all years and all weeks
    qb_id: string; the id of the QB you want to make a prediction
    week: int, predicting week
    lr_yards: linear regression model for yards
    ar_yards: ARMA model object for yards residuals
    
    '''
    ycol = ['Passing_Yards'] ## Target Variable
    Xcols = ['Cmp%', 'TD', 'Int', 'Sack', 'Completions', 'Attempts','Sack_Yards', 'QBR'] ## predictors
    ## create temp df for each QB by Year and week
    ## then reset index
    temp_df = mstr_df[mstr_df['QB_ID'] == qb_id]
    temp_df = temp_df.sort_values('G#')
    temp_df.reset_index(inplace=True)
    X = temp_df[Xcols]
    y = temp_df[ycol]
    ## grabbing the weeks before the week specified
    w = week-1
    prev_weeks = temp_df[:w][Xcols]
    y_pred = lr_yards.predict(prev_weeks)
    resid = np.array(y[ycol][:w]).flatten()-y_pred
    dates = temp_df.index
    start = dates[w]
    end = dates[w]
    residual_prediction = ar_yards.predict(start=start, end=end)
    pred_yds = residual_prediction + (y_pred.sum()/week)
    return pred_yds

#### Touchdowns

In [49]:
X = data[['Cmp%', 'Passing_Yards', 'Int', 'Sack', 'Completions', 'Attempts','Sack_Yards', 'QBR']]
y = data['TD']
lr_TD = LinearRegression()
lr_TD.fit(X,y)
predict_TD = lr_TD.predict(X)
residuals = predict_TD - y.values
ar_TD = ARMA(residuals, (4,1)).fit()

In [50]:
def TD_projections(mstr_df,qb_id,week,lr_TD,ar_TD):
    '''
    mstr_df: Pandas DF; master df with all qbs in all years and all weeks
    qb_id: string; the id of the qq you want to make a prediction
    week: int, predicting week
    lr_TD: linear regression model for touchdowns
    ar_TD: ARMA model object for touchdowns residuals
    
    '''
    ycol = ['TD'] ## Target Variable
    Xcols = ['Cmp%', 'Passing_Yards', 'Int', 'Sack', 'Completions', 'Attempts','Sack_Yards', 'QBR'] ## predictors
    ## create temp df for each QB by Year and week
    ## then reset index
    temp_df = mstr_df[mstr_df['QB_ID'] == qb_id]
    temp_df = temp_df.sort_values('G#')
    temp_df.reset_index(inplace=True)
    X = temp_df[Xcols]
    y = temp_df[ycol]
    ## grabbing the weeks before the week specified
    w = week-1
    prev_weeks = temp_df[:w][Xcols]
    y_pred = lr_TD.predict(prev_weeks)
    resid = np.array(y[ycol][:w]).flatten()-y_pred
    dates = temp_df.index
    start = dates[w]
    end = dates[w]
    residual_prediction = ar_TD.predict(start=start, end=end)
    pred_TDs = residual_prediction + (y_pred.sum()/week)
    return pred_TDs

#### Interceptions

In [51]:
X = data[['Cmp%', 'Passing_Yards', 'TD', 'Sack', 'Completions', 'Attempts','Sack_Yards', 'QBR']]
y = data['Int']
lr_Int = LinearRegression()
lr_Int.fit(X,y)
predict_Int = lr_Int.predict(X)
residuals = predict_Int - y.values
ar_Int = ARMA(residuals, (4,1)).fit()

In [52]:
def Int_projections(mstr_df,qb_id,week,lr_Int,ar_Int):
    '''
    mstr_df: Pandas DF; master df with all qbs in all years and all weeks
    qb_id: string; the id of the qq you want to make a prediction
    week: int, predicting week
    lr_Int: linear regression model for interceptions
    ar_Int: ARMA model object for interceptions residuals
    
    '''
    ycol = ['Int'] ## Target Variable
    Xcols = ['Cmp%', 'Passing_Yards', 'TD', 'Sack', 'Completions', 'Attempts','Sack_Yards', 'QBR'] ## predictors
    ## create temp df for each QB by Year and week
    ## then reset index
    temp_df = mstr_df[mstr_df['QB_ID'] == qb_id]
    temp_df = temp_df.sort_values('G#')
    temp_df.reset_index(inplace=True)
    X = temp_df[Xcols]
    y = temp_df[ycol]
    ## grabbing the weeks before the week specified
    w = week-1
    prev_weeks = temp_df[:w][Xcols]
    y_pred = lr_Int.predict(prev_weeks)
    resid = np.array(y[ycol][:w]).flatten()-y_pred
    dates = temp_df.index
    start = dates[w]
    end = dates[w]
    residual_prediction = ar_Int.predict(start=start, end=end)
    pred_Ints = residual_prediction + (y_pred.sum()/week)
    return pred_Ints

### Calculating Fantasy Point Projections

#### Function 

In [53]:
def Fantasy_Points_Calculator(QB_ID, week):
    Fantasy_point_prediction = (((yard_projections(data, QB_ID, week, lr_yards, ar_yards))/25)
    +((TD_projections(data, QB_ID, week, lr_TD, ar_TD))*6)
    -((Int_projections(data, QB_ID, week, lr_Int, ar_Int))*2))
    return Fantasy_point_prediction

In [54]:
Fantasy_Points_Calculator(, 12)

array([ 25.43340525])

In [33]:
y_peyton_13['Passing_Yards'][:w]

NameError: name 'y_peyton_13' is not defined

In [348]:
y_pred_peyton =  lr_yards.predict(train)

In [355]:
np.array(y_peyton_13['Passing_Yards'][:w] - y_pred_peyton)

array([ 129.74624222,   14.73100119])

In [339]:
y_peyton_13[:w].values.reshape(1,2).flatten()

array([462, 330], dtype=int64)

In [343]:
resid = y_pred_peyton - y_peyton_13[:w].values

In [None]:
ar_yards = ARMA(residuals, (2,1), freq=).fit()

In [242]:
p_13_hat = lr_yards.predict(peyton_13)

In [243]:
p_13_hat

array([ 332.25375778,  321.20055572,  275.37970925,  205.20029177,
        320.42678748,  397.25766565,  297.78777995,  354.3627831 ,
        271.76038609,  315.26899881,  340.52211455,  330.59862009,
        383.98076174,  305.25320622,  338.63383161,  362.27231315])

In [None]:
## Break Down By Week
Week1_df = data.loc[data['Week']==1]
Week2_df = data.loc[data['Week']==2]
Week3_df = data.loc[data['Week']==3]
Week4_df = data.loc[data['Week']==4]
Week5_df = data.loc[data['Week']==5]
Week6_df = data.loc[data['Week']==6]
Week7_df = data.loc[data['Week']==7]
Week8_df = data.loc[data['Week']==8]
Week9_df = data.loc[data['Week']==9]
Week10_df = data.loc[data['Week']==10]
Week11_df = data.loc[data['Week']==11]
Week12_df = data.loc[data['Week']==12]
Week13_df = data.loc[data['Week']==13]
Week14_df = data.loc[data['Week']==14]
Week15_df = data.loc[data['Week']==15]
Week16_df = data.loc[data['Week']==16]
Week17_df = data.loc[data['Week']==17]

## Break Down By Season
df_2016 = data.loc[data['Year']==2016]
df_2015 = data.loc[data['Year']==2015]
df_2014 = data.loc[data['Year']==2014]
df_2013 = data.loc[data['Year']==2013]

In [316]:
y = ['Passing_Yards']
X = ['Cmp%', 'TD', 'Int', 'Sack', 'Completions', 'Attempts','Sack_Yards', 'QBR']
peyton_13 = df_2013[df_2013['QB_ID'] == '2013_Peyton Manning']
X_peyton_13 = peyton_13[X]
y_peyton_13 = peyton_13[y]

peyton_13 = peyton_13.sort_values('G#')

peyton_13.reset_index(inplace=True)

week = 3
w = week-1
train = peyton_13[:w][X]
train

list of every qb by season