## Refactor the team rushing and receiving stats to include positional data

In [9]:
import logging
import os
import sys

import numpy as np
import pandas as pd

# Put the parent directory first on the module search path so the
# `from app import db` import below can resolve (presumably `app` lives one
# level up from this notebook — confirm against the repo layout).
sys.path.insert(0, '..')

from app import db

# Notebook display options.
# `None` disables column-width truncation. The previous `-1` sentinel is
# deprecated (it emitted a warning) and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 50)

  pd.set_option('display.max_colwidth', -1)


In [10]:
# Connection handle obtained from the project's `db` module; the cells below
# hand it to `pd.read_sql`.
db_conn = db.get_db_conn()

In [11]:
def _extract(db_conn) -> pd.DataFrame:
    """Read every row of the ``rushing_by_player_by_game`` table.

    Args:
        db_conn: Connection object accepted by ``pandas.read_sql``.

    Returns:
        A DataFrame holding the full contents of the table.
    """
    # The log text names the table that is actually queried (player-by-game);
    # the earlier wording mentioned "by year" and "by team", which was wrong.
    logging.info("Extracting rushing stats by player by game...")
    query = """SELECT * FROM rushing_by_player_by_game"""
    df = pd.read_sql(query, db_conn)
    # Lazy %-style arguments: the message is only formatted if it is emitted.
    logging.info("Extracted %d rows of rushing by player by game stats.", len(df))
    return df


def _transform_all(df: pd.DataFrame) -> pd.DataFrame:
    """Sum the numeric columns of ``df`` to one row per team per game.

    Args:
        df: Frame containing at least the grouping columns ``year``,
            ``season_type``, ``game_id``, ``team``, ``opp`` and ``week``.

    Returns:
        A frame with the grouping columns followed by the summed numeric
        columns. Non-numeric, non-key columns are left out.
    """
    grouping_cols = ['year', 'season_type', 'game_id', 'team', 'opp', 'week']
    # numeric_only=True makes the column dropping explicit. Without it, newer
    # pandas versions "sum" string columns by concatenating them instead of
    # discarding them.
    return df.groupby(grouping_cols, as_index=False).sum(numeric_only=True)


def _transform_by_position(df: pd.DataFrame) -> pd.DataFrame:
    """Sum the numeric columns of ``df`` per team, game and position.

    Args:
        df: Frame containing at least the grouping columns ``year``,
            ``season_type``, ``game_id``, ``team``, ``opp``, ``week`` and
            ``pos``.

    Returns:
        A frame with the grouping columns followed by the summed numeric
        columns. Non-numeric, non-key columns are left out.
    """
    grouping_cols = ['year', 'season_type', 'game_id', 'team', 'opp', 'week', 'pos']
    # numeric_only=True keeps remaining string columns from being
    # concatenated by newer pandas versions; they are dropped instead.
    return df.groupby(grouping_cols, as_index=False).sum(numeric_only=True)

In [12]:
# Load the per-player rows and immediately roll them up to team/game totals.
df = _transform_all(_extract(db_conn))
print(df.shape)
df.head()

(536, 41)


Unnamed: 0,year,season_type,game_id,team,opp,week,attempts_total,yards_total,td_total,fumbles_total,fumbles_lost_total,fumbles_out_of_bounds_total,epa_total,attempts_designed,yards_designed,td_designed,fumbles_designed,fumbles_lost_designed,fumbles_out_of_bounds_designed,epa_designed,attempts_scramble,yards_scramble,td_scramble,fumbles_scramble,fumbles_lost_scramble,fumbles_out_of_bounds_scramble,epa_scramble,attempts_kneel,yards_kneel,td_kneel,fumbles_kneel,fumbles_lost_kneel,fumbles_out_of_bounds_kneel,epa_kneel,attempts,yards,td,fumbles,fumbles_lost,fumbles_out_of_bounds,epa
0,2020,POST,2020_18_BAL_TEN,BAL,TEN,18,35,236,2,0,0,0,6.378529,32,190,1,0,0,0,3.984501,1,48,1,0,0,0,5.689723,2,-2,0,0,0,0,-3.295695,33,238,2,0,0,0,9.674223
1,2020,POST,2020_18_BAL_TEN,TEN,BAL,18,22,51,0,0,0,0,-4.666149,21,47,0,0,0,0,-4.618171,1,4,0,0,0,0,-0.047978,0,0,0,0,0,0,0.0,22,51,0,0,0,0,-4.666149
2,2020,POST,2020_18_CHI_NO,CHI,NO,18,19,48,0,0,0,0,-8.769047,16,38,0,0,0,0,-5.190295,3,10,0,0,0,0,-3.578752,0,0,0,0,0,0,0.0,19,48,0,0,0,0,-8.769047
3,2020,POST,2020_18_CHI_NO,NO,CHI,18,35,123,1,1,0,0,-8.546762,35,123,1,1,0,0,-8.546762,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0.0,35,123,1,1,0,0,-8.546762
4,2020,POST,2020_18_CLE_PIT,CLE,PIT,18,31,127,2,0,0,0,-1.390041,26,124,2,0,0,0,0.798019,2,8,0,0,0,0,-0.131966,3,-5,0,0,0,0,-2.056094,28,132,2,0,0,0,0.666054


In [13]:
# Reference only: position counts copied from a psql session. Note that the
# query shown ran against rushing_by_player_by_year, not the by-game table
# used elsewhere in this notebook.
'''
nfl=# select pos, count(pos) from rushing_by_player_by_year group by pos;
 pos | count
-----+-------
 CB  |     1
 LB  |     2
 RB  |   185
 QB  |    99
 SS  |     3
 C   |     1
 FB  |    11
 P   |     5
 WR  |   120
 DB  |     5
 TE  |    12
 '''

'\nnfl=# select pos, count(pos) from rushing_by_player_by_year group by pos;\n pos | count\n-----+-------\n CB  |     1\n LB  |     2\n RB  |   185\n QB  |    99\n SS  |     3\n C   |     1\n FB  |    11\n P   |     5\n WR  |   120\n DB  |     5\n TE  |    12\n '

In [14]:
## get the positional stats
# One row per team per game, with rushing numbers split into RB, QB (designed /
# scramble / kneel), WR and "other" buckets.
# The "other" bucket wraps pos in COALESCE: in SQL, `NULL NOT IN (...)` evaluates
# to NULL rather than TRUE, so rows with a missing position would otherwise fall
# through to ELSE 0 and be counted in no bucket at all.
query = """
    SELECT
        year,
        season_type,
        game_id,
        team,
        opp,
        week,
        SUM(CASE WHEN pos = 'RB' THEN attempts ELSE 0 END) AS attempts_rb,
        SUM(CASE WHEN pos = 'RB' THEN yards ELSE 0 END) AS yards_rb,
        SUM(CASE WHEN pos = 'RB' THEN td ELSE 0 END) AS td_rb,
        SUM(CASE WHEN pos = 'RB' THEN epa ELSE 0 END) AS epa_rb,

        SUM(CASE WHEN pos = 'QB' THEN attempts_designed ELSE 0 END) AS attempts_qb_designed,
        SUM(CASE WHEN pos = 'QB' THEN yards_designed ELSE 0 END) AS yards_qb_designed,
        SUM(CASE WHEN pos = 'QB' THEN td_designed ELSE 0 END) AS td_qb_designed,
        SUM(CASE WHEN pos = 'QB' THEN epa_designed ELSE 0 END) AS epa_qb_designed,

        SUM(CASE WHEN pos = 'QB' THEN attempts_scramble ELSE 0 END) AS attempts_qb_scramble,
        SUM(CASE WHEN pos = 'QB' THEN yards_scramble ELSE 0 END) AS yards_qb_scramble,
        SUM(CASE WHEN pos = 'QB' THEN td_scramble ELSE 0 END) AS td_qb_scramble,
        SUM(CASE WHEN pos = 'QB' THEN epa_scramble ELSE 0 END) AS epa_qb_scramble,

        SUM(CASE WHEN pos = 'QB' THEN attempts_kneel ELSE 0 END) AS attempts_qb_kneel,
        SUM(CASE WHEN pos = 'QB' THEN yards_kneel ELSE 0 END) AS yards_qb_kneel,
        SUM(CASE WHEN pos = 'QB' THEN epa_kneel ELSE 0 END) AS epa_qb_kneel,

        SUM(CASE WHEN pos = 'WR' THEN attempts ELSE 0 END) AS attempts_wr,
        SUM(CASE WHEN pos = 'WR' THEN yards ELSE 0 END) AS yards_wr,
        SUM(CASE WHEN pos = 'WR' THEN td ELSE 0 END) AS td_wr,
        SUM(CASE WHEN pos = 'WR' THEN epa ELSE 0 END) AS epa_wr,

        SUM(CASE WHEN COALESCE(pos, '') NOT IN ('RB', 'WR', 'QB') THEN attempts ELSE 0 END) as attempts_other,
        SUM(CASE WHEN COALESCE(pos, '') NOT IN ('RB', 'WR', 'QB') THEN yards ELSE 0 END) as yards_other,
        SUM(CASE WHEN COALESCE(pos, '') NOT IN ('RB', 'WR', 'QB') THEN td ELSE 0 END) as td_other,
        SUM(CASE WHEN COALESCE(pos, '') NOT IN ('RB', 'WR', 'QB') THEN epa ELSE 0 END) as epa_other

    FROM
        rushing_by_player_by_game
    GROUP BY
        year, season_type, game_id, team, opp, week
"""
df_position = pd.read_sql(query, db_conn)
print(df_position.shape)
df_position.head()

(536, 29)


Unnamed: 0,year,season_type,game_id,team,opp,week,attempts_rb,yards_rb,td_rb,epa_rb,attempts_qb_designed,yards_qb_designed,td_qb_designed,epa_qb_designed,attempts_qb_scramble,yards_qb_scramble,td_qb_scramble,epa_qb_scramble,attempts_qb_kneel,yards_qb_kneel,epa_qb_kneel,attempts_wr,yards_wr,td_wr,epa_wr,attempts_other,yards_other,td_other,epa_other
0,2020,REG,2020_04_ARI_CAR,ARI,CAR,4,17,51,0,-1.366991,3,18,0,1.018741,3,60,0,3.509554,0,0,0.0,0,0,0,0.0,0,0,0,0.0
1,2020,REG,2020_15_BUF_DEN,BUF,DEN,15,21,149,1,2.064642,1,8,0,0.496162,2,25,2,4.172348,0,0,0.0,0,0,0,0.0,0,0,0,0.0
2,2020,REG,2020_09_CHI_TEN,TEN,CHI,9,28,89,0,-4.055348,0,0,0,0.0,1,5,0,0.034573,2,-2,-1.330258,0,0,0,0.0,0,0,0,0.0
3,2020,REG,2020_04_JAX_CIN,JAX,CIN,4,17,75,0,0.144375,0,0,0,0.0,2,9,0,-1.410471,0,0,0.0,1,5,0,0.154696,0,0,0,0.0
4,2020,REG,2020_12_WAS_DAL,WAS,DAL,12,32,178,3,10.568463,0,0,0,0.0,1,3,0,-0.306639,2,-2,-3.356562,0,0,0,0.0,1,3,0,0.578134


In [15]:
# Attach the positional columns to the team/game totals. Both frames are keyed
# on the same six columns, and only keys present in both are kept.
join_cols = ['year', 'season_type', 'game_id', 'team', 'opp', 'week']
df = pd.merge(df, df_position, how='inner', on=join_cols)
print(df.shape)
df.head()

(536, 64)


Unnamed: 0,year,season_type,game_id,team,opp,week,attempts_total,yards_total,td_total,fumbles_total,fumbles_lost_total,fumbles_out_of_bounds_total,epa_total,attempts_designed,yards_designed,td_designed,fumbles_designed,fumbles_lost_designed,fumbles_out_of_bounds_designed,epa_designed,attempts_scramble,yards_scramble,td_scramble,fumbles_scramble,fumbles_lost_scramble,fumbles_out_of_bounds_scramble,epa_scramble,attempts_kneel,yards_kneel,td_kneel,fumbles_kneel,fumbles_lost_kneel,fumbles_out_of_bounds_kneel,epa_kneel,attempts,yards,td,fumbles,fumbles_lost,fumbles_out_of_bounds,epa,attempts_rb,yards_rb,td_rb,epa_rb,attempts_qb_designed,yards_qb_designed,td_qb_designed,epa_qb_designed,attempts_qb_scramble,yards_qb_scramble,td_qb_scramble,epa_qb_scramble,attempts_qb_kneel,yards_qb_kneel,epa_qb_kneel,attempts_wr,yards_wr,td_wr,epa_wr,attempts_other,yards_other,td_other,epa_other
0,2020,POST,2020_18_BAL_TEN,BAL,TEN,18,35,236,2,0,0,0,6.378529,32,190,1,0,0,0,3.984501,1,48,1,0,0,0,5.689723,2,-2,0,0,0,0,-3.295695,33,238,2,0,0,0,9.674223,17,81,1,1.040559,13,90,0,1.531735,1,48,1,5.689723,2,-2,-3.295695,2,19,0,1.412207,0,0,0,0.0
1,2020,POST,2020_18_BAL_TEN,TEN,BAL,18,22,51,0,0,0,0,-4.666149,21,47,0,0,0,0,-4.618171,1,4,0,0,0,0,-0.047978,0,0,0,0,0,0,0.0,22,51,0,0,0,0,-4.666149,20,45,0,-6.646984,1,2,0,2.028814,1,4,0,-0.047978,0,0,0.0,0,0,0,0.0,0,0,0,0.0
2,2020,POST,2020_18_CHI_NO,CHI,NO,18,19,48,0,0,0,0,-8.769047,16,38,0,0,0,0,-5.190295,3,10,0,0,0,0,-3.578752,0,0,0,0,0,0,0.0,19,48,0,0,0,0,-8.769047,13,31,0,-4.133066,0,0,0,0.0,3,10,0,-3.578752,0,0,0.0,3,7,0,-1.057229,0,0,0,0.0
3,2020,POST,2020_18_CHI_NO,NO,CHI,18,35,123,1,1,0,0,-8.546762,35,123,1,1,0,0,-8.546762,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0.0,35,123,1,1,0,0,-8.546762,27,108,1,-2.411689,7,17,0,-4.792657,0,0,0,0.0,0,0,0.0,1,-2,0,-1.342416,0,0,0,0.0
4,2020,POST,2020_18_CLE_PIT,CLE,PIT,18,31,127,2,0,0,0,-1.390041,26,124,2,0,0,0,0.798019,2,8,0,0,0,0,-0.131966,3,-5,0,0,0,0,-2.056094,28,132,2,0,0,0,0.666054,26,124,2,0.798019,0,0,0,0.0,2,8,0,-0.131966,3,-5,-2.056094,0,0,0,0.0,0,0,0,0.0


In [16]:
# Write the merged frame to rushing.csv in the working directory;
# index=False keeps the DataFrame's row index out of the file.
df.to_csv('rushing.csv', index=False)