In [1]:
'''
Setting out to answer the question, is CPOE a more sticky stat for
receivers or for quarterbacks using nflfastr data. For the years 2016,2017,2018,2019, we're gonna
look at a players CPOE for that year and the previous year.  We will compare the correlation for 
quarterback year to year CPOE and receiver year to year CPOE.  
'''

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 300)

In [2]:
first_year = 2016
last_year = 2019

In [3]:
YEARS = range(first_year,last_year + 1)
data = pd.DataFrame()

for i in YEARS:  
    i_data = pd.read_csv('https://github.com/guga31bb/nflfastR-data/blob/master/data/' \
                         'play_by_play_' + str(i) + '.csv.gz?raw=True',
                         compression='gzip', low_memory=False)

    data = data.append(i_data, sort=True)
data.reset_index(drop=True, inplace=True)
data.drop(['passer_player_name', 'passer_player_id',
           'rusher_player_name', 'rusher_player_id',
           'receiver_player_name', 'receiver_player_id'],
          axis=1, inplace=True)
data = data.loc[data.season_type=='REG']
data = data.loc[(data.play_type.isin(['no_play','pass','run'])) & (data.epa.isna()==False)]
pd.options.mode.chained_assignment = None
data['play_type'].loc[data['pass']==1] = 'pass'
data['play_type'].loc[data['rush']==1] = 'run'


In [4]:
qbs = (data.groupby(['passer','posteam','season'], as_index=False)
       .agg({'play_id':'count','cpoe':'mean'}))
qbs = qbs.loc[qbs.play_id > 200]
qbs['prev_cpoe'] = qbs['cpoe'].shift(1)
qbs['years_of_data'] = qbs.groupby('passer').cumcount()
qbs = qbs.loc[qbs.years_of_data != 0]

qbs['cpoe'].corr(qbs['prev_cpoe'])

0.38452848209058554

In [5]:
wrs = (data.groupby(['receiver','posteam','season'], as_index=False)
       .agg({'play_id':'count','cpoe':'mean'}))
wrs = wrs.loc[wrs.play_id > 50]
wrs['prev_cpoe'] = wrs['cpoe'].shift(1)
wrs['years_of_data'] = wrs.groupby('receiver').cumcount()
wrs = wrs.loc[wrs.years_of_data != 0]
wrs['cpoe'].corr(wrs['prev_cpoe'])

0.2838608034055531

In [6]:
'''
Okay.  Cool. So it looks like CPOE is more correlated to qb play than to receiver play.

But what about for the better guys? Like what if we take guys that are in the top half of cpoe.
'''
n = qbs.shape[0]

In [10]:
num_of_qbs = qbs.shape[0]
num_of_wrs = wrs.shape[0]
qbs.sort_values(by='prev_cpoe',ascending=False,inplace=True)
wrs.sort_values(by='prev_cpoe',ascending=False,inplace=True)
topqbs = qbs.drop(qbs.tail(num_of_qbs // 2).index)
topwrs = wrs.drop(wrs.tail(num_of_wrs // 2).index)

In [11]:
topqbs['cpoe'].corr(topqbs['prev_cpoe'])

0.32664123182119487

In [12]:
topwrs['cpoe'].corr(topwrs['prev_cpoe'])

0.3606290727504052