## Filter Observations

In [34]:
# Loading libraries
import pandas as pd
import numpy as np
from os import path

# file path
dataDir = '/Users/simmsjn/Documents/GitHub/ltcwff-files/data'

# loading the DF
adp = pd.read_csv(path.join(dataDir, 'adp_2017.csv'))
pg = pd.read_csv(path.join(dataDir, 'player_game_2017_sample.csv'))
adp.head()

Unnamed: 0,adp,adp_formatted,bye,high,low,name,player_id,position,stdev,team,times_drafted
0,1.3,1.01,12,1,4,David Johnson,2297,RB,0.6,ARI,310
1,2.3,1.02,7,1,6,LeVeon Bell,1983,RB,0.8,PIT,303
2,3.7,1.04,7,1,7,Antonio Brown,1886,WR,1.0,PIT,338
3,5.7,1.06,9,1,15,Julio Jones,1796,WR,3.2,ATL,131
4,6.2,1.06,8,1,17,Ezekiel Elliott,2343,RB,2.8,DAL,180


In [7]:
# Using loc[] to filter observations by passing the index value 

tom_brady_id = 119

adp.loc[tom_brady_id]

adp                       116.3
adp_formatted             10.08
bye                          10
high                         89
low                         149
name             Darren Sproles
player_id                  1105
position                     RB
stdev                      10.2
team                        PHI
times_drafted               130
Name: 119, dtype: object

In [8]:
#multiple values via a list

#my_player_list = [119, 1886, 925]

#dp.loc[my_player_list]

# This is no longer supported. 
# Coming back to figure out how it should be done

In [11]:
# passing a secong argument to limit which column you return

# adp.loc[my_player_list, ['name', 'adp', 'stdev']]

# same issues here

#### Boolean Indexing

In [12]:
is_a_rb = adp['position'] == 'RB'

In [13]:
is_a_rb.head()

0     True
1     True
2    False
3    False
4     True
Name: position, dtype: bool

In [21]:
adp_rbs = adp.loc[is_a_rb]
adp_rbs[['name', 'adp', 'position']].head()

Unnamed: 0,name,adp,position
0,David Johnson,1.3,RB
1,LeVeon Bell,2.3,RB
4,Ezekiel Elliott,6.2,RB
6,Devonta Freeman,7.0,RB
7,LeSean McCoy,7.8,RB


In [20]:
# Without having to create the variable like is_a_rb then calling the loc

adp_wrs = adp.loc[adp['position'] == 'WR']

adp_wrs[['name','adp','position']].head()

Unnamed: 0,name,adp,position
2,Antonio Brown,3.7,WR
3,Julio Jones,5.7,WR
5,Odell Beckham Jr,6.4,WR
8,Mike Evans,7.9,WR
9,A.J. Green,10.0,WR


In [22]:
# Any boolean column or boolean operation works

is_a_te = adp['position'] == 'TE'

adp_not_te = adp.loc[~is_a_te]

adp_not_te[['name','adp','position']].head()

Unnamed: 0,name,adp,position
0,David Johnson,1.3,RB
1,LeVeon Bell,2.3,RB
2,Antonio Brown,3.7,WR
3,Julio Jones,5.7,WR
4,Ezekiel Elliott,6.2,RB


#### Duplicates

In [23]:
# Filter by removing duplicates

adp.drop_duplicates(inplace=True)
adp

Unnamed: 0,adp,adp_formatted,bye,high,low,name,player_id,position,stdev,team,times_drafted
0,1.3,1.01,12,1,4,David Johnson,2297,RB,0.6,ARI,310
1,2.3,1.02,7,1,6,LeVeon Bell,1983,RB,0.8,PIT,303
2,3.7,1.04,7,1,7,Antonio Brown,1886,WR,1.0,PIT,338
3,5.7,1.06,9,1,15,Julio Jones,1796,WR,3.2,ATL,131
4,6.2,1.06,8,1,17,Ezekiel Elliott,2343,RB,2.8,DAL,180
...,...,...,...,...,...,...,...,...,...,...,...
179,164.9,14.09,8,134,180,Baltimore Defense,1332,DEF,11.5,BAL,65
180,168.2,14.12,8,151,180,Dan Bailey,1891,PK,6.5,DAL,67
181,168.2,14.12,9,144,180,Atlanta Defense,1334,DEF,9.2,ATL,68
182,168.5,15.01,6,160,178,Adam Vinatieri,1195,PK,5.4,IND,46


In [24]:
# If you are intrested in only dropping a subset of variables they can be specified

adp.drop_duplicates('position')[['name','adp','position']]

Unnamed: 0,name,adp,position
0,David Johnson,1.3,RB
2,Antonio Brown,3.7,WR
17,Rob Gronkowski,18.1,TE
24,Aaron Rodgers,23.4,QB
106,Denver Defense,105.5,DEF
150,Justin Tucker,145.0,PK


In [31]:
# to identify and not drop

adp.duplicated().head()

# Here we could use any() to check for a dudplicated value
# Or we could use sum() to see the total duplicated

0    False
1    False
2    False
3    False
4    False
dtype: bool

In [32]:
# or you can call it on the specific column(s) you would like to check

adp['position'].duplicated().head()

#duplicated only identifies the duplicates after the first. passing the keep argument as false tells python otherwise

0    False
1     True
2    False
3     True
4     True
Name: position, dtype: bool

#### Combining, Filtering, and Changing Columns

In [35]:
# say we want a column of pass_yards 4 QB, rush_yards 4 RB, rec_yards 4 WR

pg['primaryYards'] = np.nan

pg.loc[pg['pos'] == 'QB', 'primaryYards'] = pg['pass_yards']
pg.loc[pg['pos'] == 'RB', 'primaryYards'] = pg['rush_yards']
pg.loc[pg['pos'] == 'WR', 'primaryYards'] = pg['rec_yards']

In [38]:
pg[['player_name','pos', 'pass_yards', 'rush_yards', 'rec_yards', 'primaryYards']].sample(5)

Unnamed: 0,player_name,pos,pass_yards,rush_yards,rec_yards,primaryYards
1048,J.Graham,TE,0.0,0.0,26.0,
762,K.Rudolph,TE,0.0,0.0,63.0,
233,D.Parker,WR,0.0,0.0,89.0,89.0
1243,M.Goodwin,WR,0.0,0.0,80.0,80.0
1244,G.Kittle,TE,0.0,0.0,16.0,


#### The query Method is an Alternative Way to Filter

In [40]:
# A less verbose alternative: query()

pg.query("pos == 'RB'").head()

Unnamed: 0,player_name,week,carries,gameid,player_id,rush_yards,rush_fumbles,rush_tds,raw_yac,rec_fumbles,...,interceptions,pass_tds,air_tds,season,team,pos,rec_yards,receptions,targets,primaryYards
3,R.Burkhead,1,3.0,2017090700,00-0030288,15.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,2017,NE,RB,8.0,1.0,3.0,15.0
5,K.Hunt,1,17.0,2017090700,00-0033923,148.0,1.0,1.0,76.0,0.0,...,0.0,0.0,0.0,2017,KC,RB,98.0,5.0,5.0,148.0
7,R.Burkhead,2,2.0,2017091705,00-0030288,3.0,0.0,0.0,22.0,0.0,...,0.0,0.0,0.0,2017,NE,RB,41.0,3.0,4.0,3.0
8,A.Peterson,2,8.0,2017091705,00-0025394,26.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2017,ARI,RB,0.0,0.0,0.0,26.0
9,M.Ingram,2,8.0,2017091705,00-0027966,52.0,0.0,0.0,19.0,1.0,...,0.0,0.0,0.0,2017,NO,RB,18.0,4.0,5.0,52.0


In [41]:
# Calling boolean columns indirectly

pg['is_a_rb'] = pg['pos'] == 'RB'
pg.query('is_a_rb').head()

Unnamed: 0,player_name,week,carries,gameid,player_id,rush_yards,rush_fumbles,rush_tds,raw_yac,rec_fumbles,...,pass_tds,air_tds,season,team,pos,rec_yards,receptions,targets,primaryYards,is_a_rb
3,R.Burkhead,1,3.0,2017090700,00-0030288,15.0,0.0,0.0,7.0,0.0,...,0.0,0.0,2017,NE,RB,8.0,1.0,3.0,15.0,True
5,K.Hunt,1,17.0,2017090700,00-0033923,148.0,1.0,1.0,76.0,0.0,...,0.0,0.0,2017,KC,RB,98.0,5.0,5.0,148.0,True
7,R.Burkhead,2,2.0,2017091705,00-0030288,3.0,0.0,0.0,22.0,0.0,...,0.0,0.0,2017,NE,RB,41.0,3.0,4.0,3.0,True
8,A.Peterson,2,8.0,2017091705,00-0025394,26.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2017,ARI,RB,0.0,0.0,0.0,26.0,True
9,M.Ingram,2,8.0,2017091705,00-0027966,52.0,0.0,0.0,19.0,1.0,...,0.0,0.0,2017,NO,RB,18.0,4.0,5.0,52.0,True


In [43]:
# filter on whether raw_yac is missing

pg.query("raw_yac.notnull()",engine='python')[
    ['gameid','player_id', 'raw_yac']].head()

Unnamed: 0,gameid,player_id,raw_yac
0,2017090700,00-0019596,0.0
1,2017090700,00-0023436,0.0
2,2017090700,00-0026035,49.0
3,2017090700,00-0030288,7.0
4,2017090700,00-0030506,23.0
