In [1]:
import logging
import os

import pandas as pd

from nflscrapr import nflscrapr

logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [2]:
nfl = nflscrapr()
pbp_df = nfl.get_pbp_data(2019)
pbp_df.shape

INFO:root:Getting play_by_play path for year=2019 and season=reg...
  if (yield from self.run_code(code, result)):


(45546, 256)

In [3]:
pass_df = nfl.get_pass_data(2019)
pass_df.shape

INFO:root:Getting play_by_play path for year=2019 and season=reg...
  exec(code_obj, self.user_global_ns, self.user_ns)


(19063, 256)

In [4]:
# Control 1: no filtering

cols = ["passer_player_name","pass_touchdown", "yards_gained", "pass_attempt", "interception"]

pass_df_test = pass_df[cols]

passing_tds = pass_df_test[cols].groupby(["passer_player_name"], as_index=False).sum()

passing_tds.sort_values("yards_gained", ascending=False).head(10)

Unnamed: 0,passer_player_name,pass_touchdown,yards_gained,pass_attempt,interception
61,J.Winston,33.0,4831,667.0,28.0
38,D.Prescott,30.0,4753,618.0,11.0
53,J.Goff,22.0,4468,646.0,15.0
87,P.Rivers,23.0,4395,618.0,20.0
79,M.Ryan,26.0,4152,666.0,12.0
86,P.Mahomes,26.0,3906,499.0,5.0
103,T.Brady,24.0,3874,636.0,6.0
29,D.Carr,21.0,3870,539.0,8.0
25,C.Wentz,27.0,3811,644.0,6.0
96,R.Wilson,31.0,3793,562.0,5.0


In [5]:
# Control 2: remove two point attempts

cols = ["passer_player_name","pass_touchdown", "yards_gained", "pass_attempt", "interception"]

pass_df_test = pass_df[cols][
    (pbp_df["two_point_attempt"] == 0)
]

passing_tds = pass_df[cols].groupby(["passer_player_name"], as_index=False).sum()

passing_tds.sort_values("yards_gained", ascending=False).head(10)

  


Unnamed: 0,passer_player_name,pass_touchdown,yards_gained,pass_attempt,interception
61,J.Winston,33.0,4831,667.0,28.0
38,D.Prescott,30.0,4753,618.0,11.0
53,J.Goff,22.0,4468,646.0,15.0
87,P.Rivers,23.0,4395,618.0,20.0
79,M.Ryan,26.0,4152,666.0,12.0
86,P.Mahomes,26.0,3906,499.0,5.0
103,T.Brady,24.0,3874,636.0,6.0
29,D.Carr,21.0,3870,539.0,8.0
25,C.Wentz,27.0,3811,644.0,6.0
96,R.Wilson,31.0,3793,562.0,5.0


In [6]:
# Control 3: remove sacks - now yards gained is correct. Without this, pass_attempt = number of dropbacks

cols = ["passer_player_name","pass_touchdown", "yards_gained", "pass_attempt", "interception"]

pass_df_test = pass_df[cols][
    (pbp_df["sack"] == 0) &
    (pbp_df["two_point_attempt"] == 0)
]

passing_tds = pass_df_test[cols].groupby(["passer_player_name"], as_index=False).sum()

passing_tds.sort_values("yards_gained", ascending=False).head(10)

  import sys


Unnamed: 0,passer_player_name,pass_touchdown,yards_gained,pass_attempt,interception
58,J.Winston,33.0,5109,617.0,28.0
35,D.Prescott,30.0,4902,593.0,11.0
50,J.Goff,22.0,4638,622.0,15.0
82,P.Rivers,23.0,4615,583.0,20.0
75,M.Ryan,26.0,4466,613.0,12.0
89,R.Wilson,31.0,4110,512.0,5.0
96,T.Brady,24.0,4057,607.0,6.0
28,D.Carr,21.0,4054,509.0,8.0
24,C.Wentz,27.0,4039,605.0,6.0
81,P.Mahomes,26.0,4031,481.0,5.0


In [7]:
cols = ["passer_player_name","pass_touchdown", "yards_gained", "pass_attempt", "interception"]

pass_df_test = pass_df[cols][
    (pbp_df['pass_attempt'] == 1) & 
    (pbp_df["sack"] == 0) &
    (pbp_df["two_point_attempt"] == 0)
]

passing_tds = pass_df[cols].groupby(["passer_player_name"], as_index=False).sum()

passing_tds.sort_values("yards_gained", ascending=False).head(10)

  


Unnamed: 0,passer_player_name,pass_touchdown,yards_gained,pass_attempt,interception
61,J.Winston,33.0,4831,667.0,28.0
38,D.Prescott,30.0,4753,618.0,11.0
53,J.Goff,22.0,4468,646.0,15.0
87,P.Rivers,23.0,4395,618.0,20.0
79,M.Ryan,26.0,4152,666.0,12.0
86,P.Mahomes,26.0,3906,499.0,5.0
103,T.Brady,24.0,3874,636.0,6.0
29,D.Carr,21.0,3870,539.0,8.0
25,C.Wentz,27.0,3811,644.0,6.0
96,R.Wilson,31.0,3793,562.0,5.0


In [34]:
# Ok, so when we filter on play_type == pass, we get errors.

test = pbp_df[pbp_df["pass_attempt"] == 1]

cols = ["passer_player_name","pass_touchdown", "yards_gained", "pass_attempt", "interception"]

test = test[cols][
    (pbp_df["sack"] == 0) &
    (pbp_df["two_point_attempt"] == 0)
]

passing_tds = test[cols].groupby(["passer_player_name"], as_index=False).sum()

passing_tds.sort_values("pass_attempt", ascending=False).head(10)

  if __name__ == '__main__':


Unnamed: 0,passer_player_name,pass_touchdown,yards_gained,pass_attempt,interception
51,J.Goff,22.0,4638,626.0,16.0
59,J.Winston,33.0,5109,626.0,30.0
76,M.Ryan,26.0,4466,616.0,14.0
97,T.Brady,24.0,4057,613.0,8.0
24,C.Wentz,27.0,4039,607.0,7.0
36,D.Prescott,30.0,4902,596.0,11.0
83,P.Rivers,23.0,4615,591.0,20.0
6,A.Rodgers,26.0,4002,569.0,4.0
65,K.Murray,20.0,3722,542.0,12.0
14,B.Mayfield,22.0,3827,534.0,21.0


In [22]:
a = pbp_df["play_type"] != "pass"
b = pbp_df["pass_attempt"] == 1

df = pbp_df[a & b][["desc", "play_type"]]
df.head()

Unnamed: 0,desc,play_type
262,(:02) C.Newton spiked the ball to stop the clock.,qb_spike
343,"(5:55) (No Huddle, Shotgun) C.Newton pass shor...",no_play
542,(:42) C.Keenum spiked the ball to stop the clock.,qb_spike
1178,(2:34) (Shotgun) P.Mahomes pass incomplete sho...,no_play
1657,"(1:48) (No Huddle, Shotgun) R.Wilson pass inco...",no_play


In [18]:
df.shape

(150, 2)

In [19]:
df["play_type"].unique()

array(['qb_spike', 'no_play'], dtype=object)

In [20]:
test1 = pbp_df[cols][
    (pbp_df["play_type"] == "pass") &
    (pbp_df["sack"] == 0) &
    (pbp_df["two_point_attempt"] == 0)
]

test1 = test1[cols].groupby(["passer_player_name"], as_index=False).sum()

test1.sort_values("yards_gained", ascending=False).head(10)

Unnamed: 0,passer_player_name,pass_touchdown,yards_gained,pass_attempt,interception
58,J.Winston,33.0,5109,617.0,28.0
35,D.Prescott,30.0,4902,593.0,11.0
50,J.Goff,22.0,4638,622.0,15.0
82,P.Rivers,23.0,4615,583.0,20.0
75,M.Ryan,26.0,4466,613.0,12.0
89,R.Wilson,31.0,4110,512.0,5.0
96,T.Brady,24.0,4057,607.0,6.0
28,D.Carr,21.0,4054,509.0,8.0
24,C.Wentz,27.0,4039,605.0,6.0
81,P.Mahomes,26.0,4031,481.0,5.0


In [21]:
test2 = pbp_df[cols][
    (pbp_df["pass_attempt"] == 1) &
    (pbp_df["sack"] == 0) &
    (pbp_df["two_point_attempt"] == 0)
]

test2 = test2[cols].groupby(["passer_player_name"], as_index=False).sum()

test2.sort_values("yards_gained", ascending=False).head(10)

Unnamed: 0,passer_player_name,pass_touchdown,yards_gained,pass_attempt,interception
59,J.Winston,33.0,5109,626.0,30.0
36,D.Prescott,30.0,4902,596.0,11.0
51,J.Goff,22.0,4638,626.0,16.0
83,P.Rivers,23.0,4615,591.0,20.0
76,M.Ryan,26.0,4466,616.0,14.0
90,R.Wilson,31.0,4110,516.0,5.0
97,T.Brady,24.0,4057,613.0,8.0
28,D.Carr,21.0,4054,513.0,8.0
24,C.Wentz,27.0,4039,607.0,7.0
82,P.Mahomes,26.0,4031,484.0,5.0


In [28]:
a = pbp_df["play_type"] != "pass"
b = pbp_df["pass_attempt"] == 1

df = pbp_df[a & b]
test_3 = df[cols].groupby(["passer_player_name"], as_index=False).sum()
test_3.sort_values("interception", ascending=False).head(10)

Unnamed: 0,passer_player_name,pass_touchdown,yards_gained,pass_attempt,interception
0,A.Dalton,0.0,0,12.0,2.0
6,D.Blough,0.0,0,2.0,2.0
44,T.Brady,0.0,0,6.0,2.0
33,M.Ryan,0.0,0,3.0,2.0
23,J.Winston,0.0,0,9.0,2.0
5,C.Wentz,0.0,2,3.0,1.0
14,D.Watson,0.0,0,3.0,1.0
27,L.Falk,0.0,0,1.0,1.0
26,K.Murray,0.0,0,7.0,1.0
4,C.Newton,0.0,0,2.0,1.0


In [29]:
df["play_type"].unique()

array(['qb_spike', 'no_play'], dtype=object)

In [30]:
df[df["interception"] == 1]["play_type"].unique()

array(['no_play'], dtype=object)

In [33]:
# THE ISSUE IS THESE ARE PICKS WHERE THERE ARE PENALTIES ON THE RETURN ON THE RETURN TEAM!!
df[df["interception"] == 1].to_csv( "test.csv", index=False)