## Change Granularity

#### Ways of Changing Granularity

    1. Grouping --> aggregating
        -goes from fine grained data to less fine grained data, e.g. from play to game. Involves
        a loss of information: once data is at the game level, we have no idea what happened
        on any particular play.
    2. Stacking/Unstacking --> reshaping
        -less common than grouping. No loss of info. Unstacking moves data that was in
        separate rows into separate columns; stacking does the reverse.

In [4]:
# Loading libraries

import pandas as pd
import numpy as np
from os import path

# Data directory.
# Built from the current user's home directory instead of a hard-coded
# '/Users/<name>/...' prefix, so the notebook also runs from any account that
# keeps the repo at ~/Documents/GitHub/ltcwff-files.
dataDir = path.expanduser('~/Documents/GitHub/ltcwff-files/data')

# Load the two sample DataFrames used in this notebook:
#   pbp <- play_data_sample.csv
#   pg  <- player_game_2017_sample.csv
pbp = pd.read_csv(path.join(dataDir, 'play_data_sample.csv'))
pg = pd.read_csv(path.join(dataDir, 'player_game_2017_sample.csv'))

#### Grouping

In [4]:
# groupby: collapse the play-level rows into one row per game_id.
# numeric_only=True limits the sum to numeric columns. Since pandas 2.0 the
# default is numeric_only=False, which tries to "sum" string columns such as
# posteam as well (concatenating them or raising) instead of dropping them.

pbp.groupby('game_id').sum(numeric_only=True)

# We get a DF where every numeric column is summed over game_id
# also, game_id is the new index
# this can be prevented by passing the as_index=False argument to groupby


Unnamed: 0_level_0,play_id,posteam_score,defteam_score,qtr,yardline_100,down,ydstogo,yards_gained,rush_attempt,pass_attempt,...,punt_attempt,shotgun,no_huddle,air_yards,yards_after_catch,epa,wp,wpa,turnover,first_down
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018101412,287794,2269.0,2546.0,361,5750.0,260.0,1060,946,55.0,73.0,...,1.0,85,1,642.0,361.0,28.748338,72.384102,1.37429,3,39
2018111900,472385,3745.0,3995.0,429,7991.0,283.0,1362,1001,41.0,103.0,...,7.0,101,12,953.0,407.0,19.171737,76.67725,0.823359,7,41


In [10]:
# Columns to total per game in the next cell
sum_cols = [
    'yards_gained',
    'rush_attempt',
    'pass_attempt',
    'shotgun',
]



In [11]:
# Only select columns.
# Pick the columns *before* aggregating: the result is the same, but pandas
# only sums the columns in sum_cols instead of summing every column in pbp
# and throwing most of that work away.
pbp.groupby('game_id')[sum_cols].sum()

Unnamed: 0_level_0,yards_gained,rush_attempt,pass_attempt,shotgun
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018101412,946,55.0,73.0,85
2018111900,1001,41.0,103.0,101


In [12]:
# agg() takes a {column: function} mapping, so each column can be summarised
# with its own function: totals for yards_gained, interception and touchdown,
# and a count of play_id values per game

pbp.groupby('game_id').agg(dict(
    yards_gained='sum',
    play_id='count',
    interception='sum',
    touchdown='sum',
))

Unnamed: 0_level_0,yards_gained,play_id,interception,touchdown
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018101412,946,144,2.0,8.0
2018111900,1001,160,3.0,14.0


In [13]:
# Same aggregation written as "named aggregation": each key is the name of an
# output column and each value is a (source column, function) pair. This lets
# the play count be labelled nplays instead of keeping the name play_id.

pbp.groupby('game_id').agg(**{
    'yards_gained': ('yards_gained', 'sum'),
    'nplays': ('play_id', 'count'),
    'interception': ('interception', 'sum'),
    'touchdown': ('touchdown', 'sum'),
})

Unnamed: 0_level_0,yards_gained,nplays,interception,touchdown
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018101412,946,144,2.0,8.0
2018111900,1001,160,3.0,14.0


In [18]:
# Grouping by more than one key: pass a list of columns to groupby.
# The result has one row per (game_id, posteam) pair, and those two columns
# together form a multilevel index.

yards_per_team_game = (
    pbp
    .groupby(['game_id', 'posteam'])
    .agg(
        ave_yards_per_play=('yards_gained', 'mean'),
        total_yards=('yards_gained', 'sum'),
    )
)

yards_per_team_game.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ave_yards_per_play,total_yards
game_id,posteam,Unnamed: 2_level_1,Unnamed: 3_level_1
2018101412,KC,7.689655,446
2018101412,NE,6.25,500
2018111900,KC,7.479452,546
2018111900,LA,5.617284,455


#### A note on multilevel indexing

In [21]:
# .loc still works on a multilevel-indexed DF, but each row label has to be
# passed as a tuple with one value per index level: (game_id, posteam).

yards_per_team_game.loc[[
    (2018101412, 'NE'),
    (2018111900, 'LA'),
]]

# This can be avoided by calling the reset_index method immediately after the
# multi-column groupby


Unnamed: 0_level_0,Unnamed: 1_level_0,ave_yards_per_play,total_yards
game_id,posteam,Unnamed: 2_level_1,Unnamed: 3_level_1
2018101412,NE,6.25,500
2018111900,LA,5.617284,455


#### Stacking and Unstacking Data


In [11]:
# This data is at the player and game level.
# Keep only the rows where pos is 'QB' and the three columns used below.
qbs = pg.loc[pg['pos'] == 'QB',
             ['player_name', 'week', 'pass_tds']]
# Fixed random_state so the 5-row preview is the same on every run
qbs.sample(5, random_state=0)

Unnamed: 0,player_name,week,pass_tds
235,T.Taylor,17,1.0
1424,B.Hundley,13,0.0
117,A.Smith,7,3.0
733,M.Stafford,6,5.0
164,A.Smith,16,1.0


In [9]:
# say we wanted to be at the player level:
# index by (player_name, week), then unstack() moves the innermost index level
# (week) into the columns -> one row per player, one pass_tds column per week.
# Player/week combinations with no row in qbs come out as NaN.

qbs_reshaped = qbs.set_index(['player_name', 'week']).unstack()
# Fixed random_state so the 5-row preview is the same on every run
qbs_reshaped.sample(5, random_state=0)

Unnamed: 0_level_0,pass_tds,pass_tds,pass_tds,pass_tds,pass_tds,pass_tds,pass_tds,pass_tds,pass_tds,pass_tds,pass_tds,pass_tds,pass_tds,pass_tds,pass_tds,pass_tds,pass_tds
week,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
player_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
M.Stafford,5.0,2.0,2.0,0.0,2.0,5.0,,0.0,2.0,3.0,2.0,3.0,1.0,1.0,2.0,1.0,3.0
C.Newton,2.0,0.0,0.0,3.0,3.0,1.0,1.0,1.0,0.0,4.0,,0.0,2.0,1.0,4.0,0.0,1.0
M.Ryan,1.0,1.0,3.0,1.0,,1.0,1.0,2.0,2.0,2.0,2.0,1.0,0.0,1.0,,1.0,2.0
T.Taylor,2.0,0.0,2.0,1.0,1.0,,1.0,1.0,3.0,0.0,1.0,1.0,0.0,,1.0,1.0,1.0
B.Roethlisberger,2.0,2.0,1.0,1.0,2.0,1.0,2.0,1.0,,2.0,4.0,5.0,2.0,2.0,3.0,2.0,


In [13]:
# Calculating season totals: sum across the week columns (axis=1), one total
# per player. NaN weeks are skipped by sum().
# Keep the full Series in total_tds and only truncate for display, so the
# variable really holds every player's total rather than just the first five.

total_tds = qbs_reshaped.sum(axis=1)
total_tds.head()

player_name
A.Smith             27.0
B.Bortles           22.0
B.Hundley           10.0
B.Roethlisberger    32.0
C.Newton            23.0
dtype: float64

In [14]:
# Maximum number of touchdowns thrown in each week: max() reduces down the
# rows (axis=0 is the default), giving one value per week column

qbs_reshaped.max().head()

          week
pass_tds  1       5.0
          2       3.0
          3       5.0
          4       3.0
          5       3.0
dtype: float64

In [19]:
# Undo the reshape: stack() moves the innermost column level (level=-1, the
# default; here week) back into the row index, returning to one row per
# (player_name, week)

qbs_reshaped_undo = qbs_reshaped.stack(level=-1)
qbs_reshaped_undo.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pass_tds
player_name,week,Unnamed: 2_level_1
A.Smith,1,4.0
A.Smith,2,1.0
A.Smith,3,2.0
A.Smith,4,1.0
A.Smith,5,3.0
