In [1]:
import functools
import operator
from typing import List
import numpy as np
import pandas as pd

In [2]:
tips = pd.read_csv('examples/tips.csv')
tips

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.50,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4
...,...,...,...,...,...,...
239,29.03,5.92,No,Sat,Dinner,3
240,27.18,2.00,Yes,Sat,Dinner,2
241,22.67,2.00,Yes,Sat,Dinner,2
242,17.82,1.75,No,Sat,Dinner,2


In [3]:
tips['tip_pct'] = tips['tip'] / (tips['tip'] + tips['total_bill'])

In [4]:
tips

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.056111
1,10.34,1.66,No,Sun,Dinner,3,0.138333
2,21.01,3.50,No,Sun,Dinner,3,0.142799
3,23.68,3.31,No,Sun,Dinner,2,0.122638
4,24.59,3.61,No,Sun,Dinner,4,0.128014
...,...,...,...,...,...,...,...
239,29.03,5.92,No,Sat,Dinner,3,0.169385
240,27.18,2.00,Yes,Sat,Dinner,2,0.068540
241,22.67,2.00,Yes,Sat,Dinner,2,0.081070
242,17.82,1.75,No,Sat,Dinner,2,0.089423


In [5]:
##functions we pass to apply may be return either a Pandas object or a scalar value.
#the remainder of chapter 10 will include examples, starting with the tips dataset we loaded above.
result = tips.groupby('smoker')['tip_pct'].describe()

In [6]:
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.136419,0.029645,0.053744,0.120419,0.134667,0.156128,0.226
Yes,93.0,0.136454,0.054017,0.034412,0.096471,0.133333,0.163221,0.415323


In [7]:
result.unstack('smoker')

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.136419
       Yes         0.136454
std    No          0.029645
       Yes         0.054017
min    No          0.053744
       Yes         0.034412
25%    No          0.120419
       Yes         0.096471
50%    No          0.134667
       Yes         0.133333
75%    No          0.156128
       Yes         0.163221
max    No          0.226000
       Yes         0.415323
dtype: float64

In [8]:
#inside groupby, when we invoke describe it is really just this shortcut:
def f(group):
    return group.describe()

result.apply(f)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
count,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
mean,122.0,0.136437,0.041831,0.044078,0.108445,0.134,0.159675,0.320661
std,41.012193,2.4e-05,0.017233,0.01367,0.016934,0.000943,0.005015,0.133871
min,93.0,0.136419,0.029645,0.034412,0.096471,0.133333,0.156128,0.226
25%,107.5,0.136428,0.035738,0.039245,0.102458,0.133667,0.157901,0.273331
50%,122.0,0.136437,0.041831,0.044078,0.108445,0.134,0.159675,0.320661
75%,136.5,0.136445,0.047924,0.048911,0.114432,0.134334,0.161448,0.367992
max,151.0,0.136454,0.054017,0.053744,0.120419,0.134667,0.163221,0.415323


In [9]:
def top(df: pd.DataFrame, n: int=5, column: str='tip_pct'):
    return df.sort_values(column, ascending=False)[:n]

#suppressing group keys
tips.groupby('smoker', group_keys=False).apply(top)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
232,11.61,3.39,No,Sat,Dinner,2,0.226
149,7.51,2.0,No,Thur,Lunch,2,0.210305
51,10.29,2.6,No,Sun,Dinner,2,0.201707
185,20.69,5.0,No,Sun,Dinner,5,0.194628
88,24.71,5.85,No,Thur,Lunch,2,0.191427
172,7.25,5.15,Yes,Sun,Dinner,2,0.415323
178,9.6,4.0,Yes,Sun,Dinner,2,0.294118
67,3.07,1.0,Yes,Sat,Dinner,1,0.2457
183,23.17,6.5,Yes,Sun,Dinner,4,0.219077
109,14.31,4.0,Yes,Sat,Dinner,2,0.21846


In [10]:
#quantile and bucket analysis:
#pd.cut and pd.qcut let us separate up the data into buckets
#combine with groupby lets us easily perform bucket or quantile analysis on a Dataset
frame = pd.DataFrame({'data1': np.random.standard_normal(1000),
                     'data2': np.random.standard_normal(1000)})

In [11]:
frame.head()

Unnamed: 0,data1,data2
0,-0.075669,1.850556
1,0.547327,0.840329
2,0.239275,2.672239
3,-0.721486,1.319259
4,2.174544,1.1794


In [12]:
#cut the frame into four parts
quartiles = pd.cut(frame['data1'], 4)

In [13]:
quartiles.head(n=10)

0     (-1.001, 0.638]
1     (-1.001, 0.638]
2     (-1.001, 0.638]
3     (-1.001, 0.638]
4      (0.638, 2.277]
5    (-2.647, -1.001]
6     (-1.001, 0.638]
7     (-1.001, 0.638]
8      (0.638, 2.277]
9    (-2.647, -1.001]
Name: data1, dtype: category
Categories (4, interval[float64, right]): [(-2.647, -1.001] < (-1.001, 0.638] < (0.638, 2.277] < (2.277, 3.916]]

In [14]:
#same as above
quartiles.head(10)

0     (-1.001, 0.638]
1     (-1.001, 0.638]
2     (-1.001, 0.638]
3     (-1.001, 0.638]
4      (0.638, 2.277]
5    (-2.647, -1.001]
6     (-1.001, 0.638]
7     (-1.001, 0.638]
8      (0.638, 2.277]
9    (-2.647, -1.001]
Name: data1, dtype: category
Categories (4, interval[float64, right]): [(-2.647, -1.001] < (-1.001, 0.638] < (0.638, 2.277] < (2.277, 3.916]]

In [15]:
#we can pass the Categorical object obtained by pd.cut directly to groupby
#compute a set of group statistics like below
def get_stats(group):
    return pd.DataFrame(
        {'min': group.min(), 'max': group.max(),
        'count': group.count(), 'mean': group.mean()}
    )

In [16]:
grouped = frame.groupby(quartiles)

In [17]:
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fe0b05619f0>

In [18]:
grouped.apply(get_stats)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(-2.647, -1.001]",data1,-2.640178,-1.008004,163,-1.499142
"(-2.647, -1.001]",data2,-2.787575,2.773949,163,-0.07518
"(-1.001, 0.638]",data1,-0.999407,0.637435,568,-0.13039
"(-1.001, 0.638]",data2,-3.216128,3.880993,568,0.017592
"(0.638, 2.277]",data1,0.644782,2.274246,261,1.203011
"(0.638, 2.277]",data2,-2.644741,3.18271,261,-0.028749
"(2.277, 3.916]",data1,2.37318,3.916392,8,2.797382
"(2.277, 3.916]",data2,-1.205173,1.774882,8,0.032733


In [19]:
#Note: we could also have gotten the same result with the following
grouped.agg(['min', 'max', 'count', 'mean'])

Unnamed: 0_level_0,data1,data1,data1,data1,data2,data2,data2,data2
Unnamed: 0_level_1,min,max,count,mean,min,max,count,mean
data1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
"(-2.647, -1.001]",-2.640178,-1.008004,163,-1.499142,-2.787575,2.773949,163,-0.07518
"(-1.001, 0.638]",-0.999407,0.637435,568,-0.13039,-3.216128,3.880993,568,0.017592
"(0.638, 2.277]",0.644782,2.274246,261,1.203011,-2.644741,3.18271,261,-0.028749
"(2.277, 3.916]",2.37318,3.916392,8,2.797382,-1.205173,1.774882,8,0.032733


In [20]:
#pd.cut in the above situation creates equal-length buckets
#pd.qcut can create equal size buckets based on the sample quantiles
#pass bins=4 and labels=False to obtain the relevant quartile indices
quartiles_samp = pd.qcut(frame['data1'], 4, labels=False)

In [21]:
quartiles_samp.head()

0    1
1    2
2    2
3    0
4    3
Name: data1, dtype: int64

In [22]:
grouped = frame.groupby(quartiles_samp)

In [23]:
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fe0b0562590>

In [24]:
grouped.apply(get_stats)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,data1,-2.640178,-0.687926,250,-1.264451
0,data2,-2.787575,2.773949,250,-0.099105
1,data1,-0.687237,0.04229,250,-0.308973
1,data2,-3.216128,3.880993,250,0.122685
2,data1,0.043286,0.706961,250,0.350635
2,data2,-2.621028,2.947916,250,-0.025353
3,data1,0.708559,3.916392,250,1.294563
3,data2,-2.644741,3.18271,250,-0.036242


In [25]:
#Example: fill missing values with group-specific values.
#When we clean up missing data, we might want to drop some of the observations using dropna.
#We might aslo want to do the opposite. That is, filling values based on some criteria.
#fillna is the proper tool for this set of circumstances.
s = pd.Series(np.random.standard_normal(6))
s[::2] = np.nan

In [26]:
s

0         NaN
1   -1.763581
2         NaN
3    1.207878
4         NaN
5    0.541343
dtype: float64

In [27]:
s.fillna(s.mean())

0   -0.004787
1   -1.763581
2   -0.004787
3    1.207878
4   -0.004787
5    0.541343
dtype: float64

In [28]:
#We need the fill data to vary by group. We can group the data, and then use fillna and call apply(fillna) on each chunk.
#data: Sample data from states, with eastern and western regions.
states: List[str] = ['Ohio', 'New York', 'Vermont', 'Florida',
                    'Oregon', 'Nevada', 'California', 'Idaho']

In [29]:
group_key : List[str] = ['East', 'East', 'East', 'East',
                        'West', 'West', 'West', 'West']

In [30]:
data = pd.Series(np.random.standard_normal(8), index=states)

In [31]:
data

Ohio          1.569485
New York     -0.446465
Vermont      -1.478570
Florida       0.075603
Oregon        0.553807
Nevada       -0.628568
California    0.650744
Idaho        -0.639702
dtype: float64

In [32]:
#let's set some values to be missing
data[['Vermont', 'Nevada', 'Idaho']] = np.nan

In [33]:
data

Ohio          1.569485
New York     -0.446465
Vermont            NaN
Florida       0.075603
Oregon        0.553807
Nevada             NaN
California    0.650744
Idaho              NaN
dtype: float64

In [34]:
data.groupby(group_key).size()

East    4
West    4
dtype: int64

In [35]:
data.groupby(group_key).count()

East    3
West    2
dtype: int64

In [36]:
data.groupby(group_key).mean()

East    0.399541
West    0.602276
dtype: float64

In [37]:
#we can fill na values using group means
def fill_mean(group):
    return group.fillna(group.mean())

In [38]:
data.groupby(group_key, group_keys=False).apply(fill_mean)

Ohio          1.569485
New York     -0.446465
Vermont       0.399541
Florida       0.075603
Oregon        0.553807
Nevada        0.602276
California    0.650744
Idaho         0.602276
dtype: float64

In [39]:
#we can include the group keys here
data.groupby(group_key, group_keys=True).apply(fill_mean)

East  Ohio          1.569485
      New York     -0.446465
      Vermont       0.399541
      Florida       0.075603
West  Oregon        0.553807
      Nevada        0.602276
      California    0.650744
      Idaho         0.602276
dtype: float64

In [40]:
fill_values = {'East': 0.5, 'West': -1}

In [41]:
def fill_func(group):
    return group.fillna(fill_values[group.name])

In [42]:
data.groupby(group_key, group_keys=False).apply(fill_func)

Ohio          1.569485
New York     -0.446465
Vermont       0.500000
Florida       0.075603
Oregon        0.553807
Nevada       -1.000000
California    0.650744
Idaho        -1.000000
dtype: float64

In [43]:
#ramdom sampling and permutation
#construct a deck of English-style playing cards
suits : List[str] = ['H', 'S', 'C', 'D'] #Hearts, Spades, Clubs, Diamonds

In [44]:
#list(range(1, 11))

In [45]:
card_val = (list(range(1, 11)) + [10] *3) * 4
print(card_val) #a list of integers

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10]


In [46]:
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']

In [47]:
cards0 = [[str(num)+suit] for suit in suits for num in base_names]

In [48]:
cards1 = functools.reduce(operator.add, cards0)

In [49]:
deck = pd.Series(card_val, index=cards1)

In [50]:
#Now we have a Series of length 52 whose index contains card names, and values are the ones used in
#blackjack
deck.head(13)

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
dtype: int64

In [51]:
#based on what we say above, we could write a function to draw a card here
def draw(deck, n: int=5):
    return deck.sample(n)

In [52]:
draw(deck)

5S     5
9D     9
JD    10
9S     9
3D     3
dtype: int64

In [53]:
#suppose we wenated two random cards from each suit. Because the suit is the last character
#of each card name, we can group based on the suit and apply.
def get_suit(card):
    '''Takes a card and returns the suit'''
    #last letter is suit
    return card[-1]

In [54]:
deck.groupby(get_suit).apply(draw, n=2)

C  4C      4
   AC      1
D  10D    10
   5D      5
H  3H      3
   7H      7
S  8S      8
   3S      3
dtype: int64

In [55]:
#This has a multindex. We could also pass group_keys=False to just see the selected cards.
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)

AC      1
2C      2
3D      3
9D      9
2H      2
5H      5
3S      3
10S    10
dtype: int64