# Group operations - split-apply-combine
- grouping key can take many forms:
 - index or columns
 - keys do not have to be of same type
 - a list/array same length as grouping axis
 - a dictionary/Series corresponding to values on grouping axis
 - a function to be invoked on the axis or individual labels on the index

In [57]:
import pandas as pd
import numpy as np

In [91]:

np.random.seed(42)
df = pd.DataFrame({"key1" : ["a", "a", None, "b", "b", "a", None],
"key2" : pd.Series([1, 2, 1, 2, 1, None, 1], dtype="Int64"),
"data1" : np.random.standard_normal(7),
"data2" : np.random.standard_normal(7)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,0.496714,0.767435
1,a,2.0,-0.138264,-0.469474
2,,1.0,0.647689,0.54256
3,b,2.0,1.52303,-0.463418
4,b,1.0,-0.234153,-0.46573
5,a,,-0.234137,0.241962
6,,1.0,1.579213,-1.91328


In [93]:
# df = df.set_index(keys="key1")
# df.index.name = "key1"
df.groupby(by="key1", dropna=False).mean()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.5,0.041438,0.179974
b,1.5,0.644438,-0.464574
,1.0,1.113451,-0.68536


In [71]:
grouped = df["data1"].groupby(df["key1"])
grouped.mean()

key1
None    1.113451
a       0.041438
b       0.644438
Name: data1, dtype: float64

In [73]:
mean_df = df.groupby(by=["key1", "key2"]).mean()

In [90]:
mean_df

data,data1,data1,data2,data2
key2,1,2,1,2
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
,1.113451,,-0.68536,
a,0.496714,-0.138264,0.767435,-0.469474
b,-0.234153,1.52303,-0.46573,-0.463418


In [78]:
mean_df = mean_df.unstack()

In [79]:
mean_df.columns.nlevels

2

In [80]:
mean_df

Unnamed: 0_level_0,data1,data1,data2,data2
key2,1,2,1,2
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
,1.113451,,-0.68536,
a,0.496714,-0.138264,0.767435,-0.469474
b,-0.234153,1.52303,-0.46573,-0.463418


In [86]:
mean_df.columns.names

FrozenList([None, 'key2'])

In [87]:
mean_df.columns.names = ["data", "key2"]
mean_df

data,data1,data1,data2,data2
key2,1,2,1,2
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
,1.113451,,-0.68536,
a,0.496714,-0.138264,0.767435,-0.469474
b,-0.234153,1.52303,-0.46573,-0.463418


In [88]:
mean_df.swaplevel(i="data", j="key2", axis=1)

key2,1,2,1,2
data,data1,data1,data2,data2
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
,1.113451,,-0.68536,
a,0.496714,-0.138264,0.767435,-0.469474
b,-0.234153,1.52303,-0.46573,-0.463418


In [89]:
df.groupby(["key1", "key2"]).size()

key1  key2
None  1       2
a     1       1
      2       1
b     1       1
      2       1
dtype: int64

- `count` computes number of non-null values in group

In [98]:
df.groupby(["key1"], dropna=False).count()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,2,3,3
b,2,2,2
,2,2,2


In [100]:
for key1, group in df.groupby("key1"):
    print(key1)
    print(group)

a
  key1  key2     data1     data2
0    a     1  0.496714  0.767435
1    a     2 -0.138264 -0.469474
5    a  <NA> -0.234137  0.241962
b
  key1  key2     data1     data2
3    b     2  1.523030 -0.463418
4    b     1 -0.234153 -0.465730


In [101]:
for (key1, key2), group in df.groupby(["key1", "key2"]):
    print(key1, key2)
    print(group)

a 1
  key1  key2     data1     data2
0    a     1  0.496714  0.767435
a 2
  key1  key2     data1     data2
1    a     2 -0.138264 -0.469474
b 1
  key1  key2     data1    data2
4    b     1 -0.234153 -0.46573
b 2
  key1  key2    data1     data2
3    b     2  1.52303 -0.463418


In [102]:
pieces = {name: group for name, group in df.groupby(["key1", "key2"])}
pieces

{('a',
  1):   key1  key2     data1     data2
 0    a     1  0.496714  0.767435,
 ('a',
  2):   key1  key2     data1     data2
 1    a     2 -0.138264 -0.469474,
 ('b',
  1):   key1  key2     data1    data2
 4    b     1 -0.234153 -0.46573,
 ('b',
  2):   key1  key2    data1     data2
 3    b     2  1.52303 -0.463418}

- grouping on `columns` axis

In [106]:
grouped_cols = df.groupby({"key1": "key", "key2": "key", "data1": "data", "data2": "data"}, axis=1)
for name, group in grouped_cols:
    print(name)
    print(group)

data
      data1     data2
0  0.496714  0.767435
1 -0.138264 -0.469474
2  0.647689  0.542560
3  1.523030 -0.463418
4 -0.234153 -0.465730
5 -0.234137  0.241962
6  1.579213 -1.913280
key
   key1  key2
0     a     1
1     a     2
2  None     1
3     b     2
4     b     1
5     a  <NA>
6  None     1


In [107]:
s_grouped = df.groupby(["key1", "key2"])["data1"]

In [111]:
print(*s_grouped)

(('a', 1), 0    0.496714
Name: data1, dtype: float64) (('a', 2), 1   -0.138264
Name: data1, dtype: float64) (('b', 1), 4   -0.234153
Name: data1, dtype: float64) (('b', 2), 3    1.52303
Name: data1, dtype: float64)


 - grouping by `dictionary` and `Series`

In [112]:
people = pd.DataFrame(np.random.standard_normal((5, 5)),
columns=["a", "b", "c", "d", "e"],
index=["Joe", "Steve", "Wanda", "Jill", "Trey"])

people.iloc[2:3, [1, 2]] = np.nan # Add a few NA values
 

In [114]:
people

Unnamed: 0,a,b,c,d,e
Joe,-1.724918,-0.562288,-1.012831,0.314247,-0.908024
Steve,-1.412304,1.465649,-0.225776,0.067528,-1.424748
Wanda,-0.544383,,,0.375698,-0.600639
Jill,-0.291694,-0.601707,1.852278,-0.013497,-1.057711
Trey,0.822545,-1.220844,0.208864,-1.95967,-1.328186


In [117]:
grp_mapping = {"a": "red",
"b": "red", 
"c": "orange",
"d": "orange",
"e": "red",
"f": "green" } # unused group mappings are ok
by_col = people.groupby(grp_mapping, axis=1, dropna=False)
by_col.mean()

Unnamed: 0,orange,red
Joe,-0.349292,-1.065076
Steve,-0.079124,-0.457134
Wanda,0.375698,-0.572511
Jill,0.91939,-0.65037
Trey,-0.875403,-0.575495


In [118]:
map_series = pd.Series(grp_mapping)
map_series

a       red
b       red
c    orange
d    orange
e       red
f     green
dtype: object

In [120]:

by_col = people.groupby(grp_mapping, axis=1, dropna=False)
by_col.mean()
by_col.count()

Unnamed: 0,orange,red
Joe,2,3
Steve,2,3
Wanda,1,2
Jill,2,3
Trey,2,3


- by `functions`

In [122]:
# group over all people with same length of names
people.groupby(len).count() # computes len on each index value and return value used as index

Unnamed: 0,a,b,c,d,e
3,1,1,1,1,1
4,2,2,2,2,2
5,2,1,1,2,2


- mix arrays with functions

In [123]:
key_list = ["one", "one", "one", "two", "two"]
people.groupby([len, key_list]).count()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,1,1,1,1,1
4,two,2,2,2,2,2
5,one,2,1,1,2,2


In [124]:
people

Unnamed: 0,a,b,c,d,e
Joe,-1.724918,-0.562288,-1.012831,0.314247,-0.908024
Steve,-1.412304,1.465649,-0.225776,0.067528,-1.424748
Wanda,-0.544383,,,0.375698,-0.600639
Jill,-0.291694,-0.601707,1.852278,-0.013497,-1.057711
Trey,0.822545,-1.220844,0.208864,-1.95967,-1.328186


- groupby `index levels`

In [125]:
columns = pd.MultiIndex.from_arrays([["US", "US", "US", "JP", "JP"],
[1, 3, 5, 1, 3]],
names=["cty", "tenor"])
hier_df = pd.DataFrame(np.random.standard_normal((4, 5)), columns = columns)
hier_df


cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.196861,0.738467,0.171368,-0.115648,-0.301104
1,-1.478522,-0.719844,-0.460639,1.057122,0.343618
2,-1.76304,0.324084,-0.385082,-0.676922,0.611676
3,1.031,0.93128,-0.839218,-0.309212,0.331263


In [130]:
hier_df.groupby(level=1, axis="columns").mean()

tenor,1,3,5
0,0.040606,0.218681,0.171368
1,-0.2107,-0.188113,-0.460639
2,-1.219981,0.46788,-0.385082
3,0.360894,0.631272,-0.839218


# Data Aggregation

In [138]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,0.496714,0.767435
1,a,2.0,-0.138264,-0.469474
2,,1.0,0.647689,0.54256
3,b,2.0,1.52303,-0.463418
4,b,1.0,-0.234153,-0.46573
5,a,,-0.234137,0.241962
6,,1.0,1.579213,-1.91328


In [141]:
grouped = df.groupby("key1")

def max_plus1(arr):
    return arr.max() + 1

grouped.apply(max_plus1)

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,3.0,1.496714,1.767435
b,3.0,2.52303,0.536582


In [142]:
grouped.agg(max_plus1)

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,3,1.496714,1.767435
b,3,2.52303,0.536582


In [143]:
grouped.describe()

Unnamed: 0_level_0,key2,key2,key2,key2,key2,key2,key2,key2,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
a,2.0,1.5,0.707107,1.0,1.25,1.5,1.75,2.0,3.0,0.041438,...,0.179225,0.496714,3.0,0.179974,0.62078,-0.469474,-0.113756,0.241962,0.504699,0.767435
b,2.0,1.5,0.707107,1.0,1.25,1.5,1.75,2.0,2.0,0.644438,...,1.083734,1.52303,2.0,-0.464574,0.001635,-0.46573,-0.465152,-0.464574,-0.463996,-0.463418


In [144]:
grouped.mean()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.5,0.041438,0.179974
b,1.5,0.644438,-0.464574


# Column-wise and Multiple function application

In [145]:
tips = pd.read_csv("examples/tips.csv")
tips

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.50,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4
...,...,...,...,...,...,...
239,29.03,5.92,No,Sat,Dinner,3
240,27.18,2.00,Yes,Sat,Dinner,2
241,22.67,2.00,Yes,Sat,Dinner,2
242,17.82,1.75,No,Sat,Dinner,2


In [146]:
tips_grp = tips.groupby(["smoker", "day"])
tips_grp.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,total_bill,total_bill,total_bill,total_bill,total_bill,total_bill,total_bill,tip,tip,tip,tip,tip,size,size,size,size,size,size,size,size
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
smoker,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
No,Fri,4.0,18.42,5.059282,12.46,15.1,19.235,22.555,22.75,4.0,2.8125,...,3.3125,3.5,4.0,2.25,0.5,2.0,2.0,2.0,2.25,3.0
No,Sat,45.0,19.661778,8.939181,7.25,14.73,17.82,20.65,48.33,45.0,3.102889,...,3.39,9.0,45.0,2.555556,0.78496,1.0,2.0,2.0,3.0,4.0
No,Sun,57.0,20.506667,8.130189,8.77,14.78,18.43,25.0,48.17,57.0,3.167895,...,3.92,6.0,57.0,2.929825,1.032674,2.0,2.0,3.0,4.0,6.0
No,Thur,45.0,17.113111,7.721728,7.51,11.69,15.95,20.27,41.19,45.0,2.673778,...,3.0,6.7,45.0,2.488889,1.179796,1.0,2.0,2.0,2.0,6.0
Yes,Fri,15.0,16.813333,9.086388,5.75,11.69,13.42,18.665,40.17,15.0,2.714,...,3.24,4.73,15.0,2.066667,0.593617,1.0,2.0,2.0,2.0,4.0
Yes,Sat,42.0,21.276667,10.069138,3.07,13.405,20.39,26.7925,50.81,42.0,2.875476,...,3.1975,10.0,42.0,2.47619,0.862161,1.0,2.0,2.0,3.0,5.0
Yes,Sun,19.0,24.12,10.442511,7.25,17.165,23.1,32.375,45.35,19.0,3.516842,...,4.0,6.5,19.0,2.578947,0.901591,2.0,2.0,2.0,3.0,5.0
Yes,Thur,17.0,19.190588,8.355149,10.34,13.51,16.47,19.81,43.11,17.0,3.03,...,4.0,5.0,17.0,2.352941,0.701888,2.0,2.0,2.0,2.0,4.0


In [147]:
tips_grp.agg(["mean", "std", max_plus1])

  tips_grp.agg(["mean", "std", max_plus1])


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,total_bill,total_bill,tip,tip,tip,size,size,size
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,max_plus1,mean,std,max_plus1,mean,std,max_plus1
smoker,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
No,Fri,18.42,5.059282,23.75,2.8125,0.898494,4.5,2.25,0.5,4
No,Sat,19.661778,8.939181,49.33,3.102889,1.642088,10.0,2.555556,0.78496,5
No,Sun,20.506667,8.130189,49.17,3.167895,1.224785,7.0,2.929825,1.032674,7
No,Thur,17.113111,7.721728,42.19,2.673778,1.282964,7.7,2.488889,1.179796,7
Yes,Fri,16.813333,9.086388,41.17,2.714,1.077668,5.73,2.066667,0.593617,5
Yes,Sat,21.276667,10.069138,51.81,2.875476,1.63058,11.0,2.47619,0.862161,6
Yes,Sun,24.12,10.442511,46.35,3.516842,1.261151,7.5,2.578947,0.901591,6
Yes,Thur,19.190588,8.355149,44.11,3.03,1.113491,6.0,2.352941,0.701888,5


In [152]:
tips_grp.agg([("average", "mean"), ("std-dev", lambda x: x.std())])

  tips_grp.agg([("average", "mean"), ("std-dev", lambda x: x.std())])


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,total_bill,tip,tip,size,size
Unnamed: 0_level_1,Unnamed: 1_level_1,average,std-dev,average,std-dev,average,std-dev
smoker,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
No,Fri,18.42,5.059282,2.8125,0.898494,2.25,0.5
No,Sat,19.661778,8.939181,3.102889,1.642088,2.555556,0.78496
No,Sun,20.506667,8.130189,3.167895,1.224785,2.929825,1.032674
No,Thur,17.113111,7.721728,2.673778,1.282964,2.488889,1.179796
Yes,Fri,16.813333,9.086388,2.714,1.077668,2.066667,0.593617
Yes,Sat,21.276667,10.069138,2.875476,1.63058,2.47619,0.862161
Yes,Sun,24.12,10.442511,3.516842,1.261151,2.578947,0.901591
Yes,Thur,19.190588,8.355149,3.03,1.113491,2.352941,0.701888


- customs function for each column

In [160]:
tips_grp.agg({"total_bill": [("bill_max", "max"), "mean"], "time": ["count", np.unique], "tip": ["mean", "min"]})

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,total_bill,time,time,tip,tip
Unnamed: 0_level_1,Unnamed: 1_level_1,bill_max,mean,count,unique,mean,min
smoker,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
No,Fri,22.75,18.42,4,"[Dinner, Lunch]",2.8125,1.5
No,Sat,48.33,19.661778,45,[Dinner],3.102889,1.0
No,Sun,48.17,20.506667,57,[Dinner],3.167895,1.01
No,Thur,41.19,17.113111,45,"[Dinner, Lunch]",2.673778,1.25
Yes,Fri,40.17,16.813333,15,"[Dinner, Lunch]",2.714,1.0
Yes,Sat,50.81,21.276667,42,[Dinner],2.875476,1.0
Yes,Sun,45.35,24.12,19,[Dinner],3.516842,1.5
Yes,Thur,43.11,19.190588,17,[Lunch],3.03,2.0


In [166]:
# disable index
tips.groupby(["day", "smoker"], as_index=False).agg(["mean", "std"]) 

  tips.groupby(["day", "smoker"], as_index=False).agg(["mean", "std"])


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,total_bill,tip,tip,size,size
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,18.42,5.059282,2.8125,0.898494,2.25,0.5
Fri,Yes,16.813333,9.086388,2.714,1.077668,2.066667,0.593617
Sat,No,19.661778,8.939181,3.102889,1.642088,2.555556,0.78496
Sat,Yes,21.276667,10.069138,2.875476,1.63058,2.47619,0.862161
Sun,No,20.506667,8.130189,3.167895,1.224785,2.929825,1.032674
Sun,Yes,24.12,10.442511,3.516842,1.261151,2.578947,0.901591
Thur,No,17.113111,7.721728,2.673778,1.282964,2.488889,1.179796
Thur,Yes,19.190588,8.355149,3.03,1.113491,2.352941,0.701888
