In [1]:
from typing import List
import numpy as np
import pandas as pd

In [2]:
tips = pd.read_csv('examples/tips.csv')
tips

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.50,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4
...,...,...,...,...,...,...
239,29.03,5.92,No,Sat,Dinner,3
240,27.18,2.00,Yes,Sat,Dinner,2
241,22.67,2.00,Yes,Sat,Dinner,2
242,17.82,1.75,No,Sat,Dinner,2


In [3]:
tips['tip_pct'] = tips['tip'] / (tips['tip'] + tips['total_bill'])

In [4]:
tips

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.056111
1,10.34,1.66,No,Sun,Dinner,3,0.138333
2,21.01,3.50,No,Sun,Dinner,3,0.142799
3,23.68,3.31,No,Sun,Dinner,2,0.122638
4,24.59,3.61,No,Sun,Dinner,4,0.128014
...,...,...,...,...,...,...,...
239,29.03,5.92,No,Sat,Dinner,3,0.169385
240,27.18,2.00,Yes,Sat,Dinner,2,0.068540
241,22.67,2.00,Yes,Sat,Dinner,2,0.081070
242,17.82,1.75,No,Sat,Dinner,2,0.089423


In [5]:
##functions we pass to apply may be return either a Pandas object or a scalar value.
#the remainder of chapter 10 will include examples, starting with the tips dataset we loaded above.
result = tips.groupby('smoker')['tip_pct'].describe()

In [6]:
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.136419,0.029645,0.053744,0.120419,0.134667,0.156128,0.226
Yes,93.0,0.136454,0.054017,0.034412,0.096471,0.133333,0.163221,0.415323


In [7]:
result.unstack('smoker')

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.136419
       Yes         0.136454
std    No          0.029645
       Yes         0.054017
min    No          0.053744
       Yes         0.034412
25%    No          0.120419
       Yes         0.096471
50%    No          0.134667
       Yes         0.133333
75%    No          0.156128
       Yes         0.163221
max    No          0.226000
       Yes         0.415323
dtype: float64

In [8]:
#inside groupby, when we invoke describe it is really just this shortcut:
def f(group):
    return group.describe()

result.apply(f)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
count,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
mean,122.0,0.136437,0.041831,0.044078,0.108445,0.134,0.159675,0.320661
std,41.012193,2.4e-05,0.017233,0.01367,0.016934,0.000943,0.005015,0.133871
min,93.0,0.136419,0.029645,0.034412,0.096471,0.133333,0.156128,0.226
25%,107.5,0.136428,0.035738,0.039245,0.102458,0.133667,0.157901,0.273331
50%,122.0,0.136437,0.041831,0.044078,0.108445,0.134,0.159675,0.320661
75%,136.5,0.136445,0.047924,0.048911,0.114432,0.134334,0.161448,0.367992
max,151.0,0.136454,0.054017,0.053744,0.120419,0.134667,0.163221,0.415323


In [9]:
def top(df: pd.DataFrame, n: int=5, column: str='tip_pct'):
    return df.sort_values(column, ascending=False)[:n]

#suppressing group keys
tips.groupby('smoker', group_keys=False).apply(top)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
232,11.61,3.39,No,Sat,Dinner,2,0.226
149,7.51,2.0,No,Thur,Lunch,2,0.210305
51,10.29,2.6,No,Sun,Dinner,2,0.201707
185,20.69,5.0,No,Sun,Dinner,5,0.194628
88,24.71,5.85,No,Thur,Lunch,2,0.191427
172,7.25,5.15,Yes,Sun,Dinner,2,0.415323
178,9.6,4.0,Yes,Sun,Dinner,2,0.294118
67,3.07,1.0,Yes,Sat,Dinner,1,0.2457
183,23.17,6.5,Yes,Sun,Dinner,4,0.219077
109,14.31,4.0,Yes,Sat,Dinner,2,0.21846


In [10]:
#quantile and bucket analysis:
#pd.cut and pd.qcut let us separate up the data into buckets
#combine with groupby lets us easily perform bucket or quantile analysis on a Dataset
frame = pd.DataFrame({'data1': np.random.standard_normal(1000),
                     'data2': np.random.standard_normal(1000)})

In [11]:
frame.head()

Unnamed: 0,data1,data2
0,1.270663,0.703616
1,0.011528,0.330437
2,-0.073277,-0.899604
3,0.224002,0.341839
4,1.224541,-0.579144


In [12]:
#cut the frame into four parts
quartiles = pd.cut(frame['data1'], 4)

In [13]:
quartiles.head(n=10)

0      (-0.178, 1.51]
1      (-0.178, 1.51]
2      (-0.178, 1.51]
3      (-0.178, 1.51]
4      (-0.178, 1.51]
5      (-0.178, 1.51]
6      (-0.178, 1.51]
7      (-0.178, 1.51]
8    (-1.866, -0.178]
9       (1.51, 3.198]
Name: data1, dtype: category
Categories (4, interval[float64, right]): [(-3.561, -1.866] < (-1.866, -0.178] < (-0.178, 1.51] < (1.51, 3.198]]

In [14]:
#same as above
quartiles.head(10)

0      (-0.178, 1.51]
1      (-0.178, 1.51]
2      (-0.178, 1.51]
3      (-0.178, 1.51]
4      (-0.178, 1.51]
5      (-0.178, 1.51]
6      (-0.178, 1.51]
7      (-0.178, 1.51]
8    (-1.866, -0.178]
9       (1.51, 3.198]
Name: data1, dtype: category
Categories (4, interval[float64, right]): [(-3.561, -1.866] < (-1.866, -0.178] < (-0.178, 1.51] < (1.51, 3.198]]

In [15]:
#we can pass the Categorical object obtained by pd.cut directly to groupby
#compute a set of group statistics like below
def get_stats(group):
    return pd.DataFrame(
        {'min': group.min(), 'max': group.max(),
        'count': group.count(), 'mean': group.mean()}
    )

In [16]:
grouped = frame.groupby(quartiles)

In [17]:
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fddd0deceb0>

In [18]:
grouped.apply(get_stats)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(-3.561, -1.866]",data1,-3.553842,-1.868234,34,-2.337031
"(-3.561, -1.866]",data2,-1.894785,2.424788,34,0.120098
"(-1.866, -0.178]",data1,-1.841244,-0.178694,380,-0.752067
"(-1.866, -0.178]",data2,-3.694924,3.255395,380,0.032628
"(-0.178, 1.51]",data1,-0.173075,1.488865,517,0.495903
"(-0.178, 1.51]",data2,-2.880263,2.872627,517,-0.054702
"(1.51, 3.198]",data1,1.510776,3.198367,69,1.997722
"(1.51, 3.198]",data2,-2.309292,1.899891,69,-0.113597


In [19]:
#Note: we could also have gotten the same result with the following
grouped.agg(['min', 'max', 'count', 'mean'])

Unnamed: 0_level_0,data1,data1,data1,data1,data2,data2,data2,data2
Unnamed: 0_level_1,min,max,count,mean,min,max,count,mean
data1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
"(-3.561, -1.866]",-3.553842,-1.868234,34,-2.337031,-1.894785,2.424788,34,0.120098
"(-1.866, -0.178]",-1.841244,-0.178694,380,-0.752067,-3.694924,3.255395,380,0.032628
"(-0.178, 1.51]",-0.173075,1.488865,517,0.495903,-2.880263,2.872627,517,-0.054702
"(1.51, 3.198]",1.510776,3.198367,69,1.997722,-2.309292,1.899891,69,-0.113597


In [20]:
#pd.cut in the above situation creates equal-length buckets
#pd.qcut can create equal size buckets based on the sample quantiles
#pass bins=4 and labels=False to obtain the relevant quartile indices
quartiles_samp = pd.qcut(frame['data1'], 4, labels=False)

In [21]:
quartiles_samp.head()

0    3
1    1
2    1
3    2
4    3
Name: data1, dtype: int64

In [22]:
grouped = frame.groupby(quartiles_samp)

In [23]:
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fddd0ded7b0>

In [24]:
grouped.apply(get_stats)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,data1,-3.553842,-0.572485,250,-1.224232
0,data2,-3.694924,3.255395,250,0.105618
1,data1,-0.572126,0.019718,250,-0.260249
1,data2,-2.526946,2.760798,250,-0.080533
2,data1,0.021298,0.647009,250,0.330392
2,data2,-2.880263,2.376868,250,-0.069037
3,data1,0.648736,3.198367,250,1.270009
3,data2,-2.67321,2.872627,250,-0.034596


In [25]:
#Example: fill missing values with group-specific values.
#When we clean up missing data, we might want to drop some of the observations using dropna.
#We might aslo want to do the opposite. That is, filling values based on some criteria.
#fillna is the proper tool for this set of circumstances.
s = pd.Series(np.random.standard_normal(6))
s[::2] = np.nan

In [26]:
s

0         NaN
1    0.909431
2         NaN
3    0.058836
4         NaN
5    1.366490
dtype: float64

In [27]:
s.fillna(s.mean())

0    0.778252
1    0.909431
2    0.778252
3    0.058836
4    0.778252
5    1.366490
dtype: float64

In [28]:
#We need the fill data to vary by group. We can group the data, and then use fillna and call apply(fillna) on each chunk.
#data: Sample data from states, with eastern and western regions.
states: List[str] = ['Ohio', 'New York', 'Vermont', 'Florida',
                    'Oregon', 'Nevada', 'California', 'Idaho']