# Use Built-in Pandas Functions That Work on DataFrames

In [14]:
# importing libraries
import pandas as pd
import numpy as np
from os import path

# File path
dataDir = '/Users/simmsjn/Documents/Github/ltcwff-files/data'

# DataFrame
adp = pd.read_csv(path.join(dataDir, 'adp_2017.csv'))
pg = pd.read_csv(path.join(dataDir, 'player_game_2017_sample.csv'))

#### Summary Statistic Functions

In [4]:
# mean

adp.mean()

adp                88.582609
adp_formatted       7.897391
bye                 8.907609
high               67.108696
low               107.250000
player_id        1850.195652
stdev               8.338587
times_drafted     164.614130
dtype: float64

In [5]:
# max

adp.max()

adp                  172.7
adp_formatted        15.05
bye                     12
high                   162
low                    180
name             Zay Jones
player_id             2523
position                WR
stdev                 21.3
team                   WAS
times_drafted          338
dtype: object

In [8]:
# Other summary statistics

#std
print(adp.std())

# count
print(adp.count())

# sum
print(adp.sum())

# min

print(adp.min())

adp               49.770300
adp_formatted      4.139712
bye                2.234148
high              41.788292
low               54.352950
player_id        566.762462
stdev              4.253276
times_drafted     77.900511
dtype: float64
adp              184
adp_formatted    184
bye              184
high             184
low              184
name             184
player_id        184
position         184
stdev            184
team             184
times_drafted    184
dtype: int64
adp                                                        16299.2
adp_formatted                                              1453.12
bye                                                           1639
high                                                         12348
low                                                          19734
name             David JohnsonLeVeon BellAntonio BrownJulio Jon...
player_id                                                   340436
position         RBRBWRWRRBWRRBRBWRWRRBWRWRRBRBR

#### Axis

In [9]:
# axis = 0 - Rows | axis = 1 - Columns
# axis = 0 is the default so it doesn't need to be explicitly passed

adp[['adp', 'low', 'high', 'stdev']].mean(axis=0)

adp       88.582609
low      107.250000
high      67.108696
stdev      8.338587
dtype: float64

In [12]:
adp[['adp', 'low', 'high', 'stdev']].mean(axis=1).head()

0    1.725
1    2.525
2    3.175
3    6.225
4    6.750
dtype: float64

#### Summary Functions on Boolean Columns

In [16]:
# pandas treats boolean columns as 0 - False and 1 - True

pg['good_rb_game'] = ((pg['pos'] == 'RB') &
                     (pg['rush_yards'] == 100))
pg.head()

Unnamed: 0,player_name,week,carries,gameid,player_id,rush_yards,rush_fumbles,rush_tds,raw_yac,rec_fumbles,...,interceptions,pass_tds,air_tds,season,team,pos,rec_yards,receptions,targets,good_rb_game
0,T.Brady,1,0.0,2017090700,00-0019596,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2017,NE,QB,0.0,0.0,0.0,False
1,A.Smith,1,2.0,2017090700,00-0023436,9.0,0.0,0.0,0.0,0.0,...,0.0,4.0,1.0,2017,KC,QB,0.0,0.0,0.0,False
2,D.Amendola,1,0.0,2017090700,00-0026035,0.0,0.0,0.0,49.0,1.0,...,0.0,0.0,0.0,2017,NE,WR,100.0,6.0,7.0,False
3,R.Burkhead,1,3.0,2017090700,00-0030288,15.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,2017,NE,RB,8.0,1.0,3.0,False
4,T.Kelce,1,1.0,2017090700,00-0030506,4.0,0.0,0.0,23.0,0.0,...,0.0,0.0,0.0,2017,KC,TE,40.0,5.0,7.0,False


In [17]:
# mean
pg['good_rb_game'].mean()

0.0020964360587002098

In [24]:
# sum

pg['good_rb_game'].sum()

3

Boolean specific functions

    all: which returns True if ALL vals in column are True
    any: which returns True if ANY vals in the column are True

These also take the axis argument

In [25]:
# any

(pg['pass_yards'] > 400).any() 

True

In [29]:
# all
(pg['rush_yards'] >= 0).all()

False

In [30]:
# any(axis = ) to look at how often a players rush yards or receiving yards were greater than 100

(pg[['rush_yards', 'rec_yards']] > 100).any(axis = 1)

0       False
1       False
2       False
3       False
4       False
        ...  
1426    False
1427    False
1428    False
1429    False
1430    False
Length: 1431, dtype: bool

In [33]:
# How often?
(pg[['rush_yards', 'rec_yards']] > 100).any(axis = 1).sum()

100

In [34]:
# how about both > 100?

(pg[['rush_yards', 'rec_yards']] > 100).all(axis = 1).sum()

# Never

0

In [35]:
# What if we lower the bar?

(pg[['rush_yards', 'rec_yards']] > 75).all(axis = 1).sum()

# only 4

4

#### Other Misc Built-in Summary Functions

In [37]:
# value_counts - summarizes the frequency of individual values

adp['position'].value_counts()

WR     63
RB     62
QB     21
TE     17
DEF    13
PK      8
Name: position, dtype: int64

In [45]:
# normalizing - dividing each by the total so they add up to 1 and represent proportions

adp['position'].value_counts(normalize = True)

# each values given represents a percentage. so WR - 34.2%, RB - 33.7% etc...

WR     0.342391
RB     0.336957
QB     0.114130
TE     0.092391
DEF    0.070652
PK     0.043478
Name: position, dtype: float64

In [48]:
# also useful: crosstab - frequencies for all the combinations between TWO columns

pd.crosstab(adp['team'], adp['position']).head()

# also take the optional normalize

position,DEF,PK,QB,RB,TE,WR
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ARI,1,0,1,2,0,2
ATL,1,1,1,2,1,3
BAL,1,1,0,2,0,2
BUF,0,0,1,1,0,3
CAR,1,0,1,2,1,0
