# Group operations - split-apply-combine
- The grouping key can take many forms:
 - an index level or a column name
 - the keys do not all have to be of the same type
 - a list or array of the same length as the axis being grouped
 - a dictionary or Series that maps the values on the grouping axis to group names
 - a function to be invoked on the axis index or on the individual labels in the index

In [57]:
import pandas as pd
import numpy as np

In [91]:

# Seed NumPy's legacy global RNG so the random columns below are reproducible.
np.random.seed(42)

# Small demo frame: two key columns that contain missing labels, plus two
# columns of standard-normal noise (data1 is drawn before data2).
df = pd.DataFrame(
    {
        "key1": ["a", "a", None, "b", "b", "a", None],
        "key2": pd.Series([1, 2, 1, 2, 1, None, 1], dtype="Int64"),
        "data1": np.random.standard_normal(size=7),
        "data2": np.random.standard_normal(size=7),
    }
)
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,0.496714,0.767435
1,a,2.0,-0.138264,-0.469474
2,,1.0,0.647689,0.54256
3,b,2.0,1.52303,-0.463418
4,b,1.0,-0.234153,-0.46573
5,a,,-0.234137,0.241962
6,,1.0,1.579213,-1.91328


In [93]:
# Mean of every non-key column per key1 label; dropna=False keeps the rows
# whose key1 is missing together as an extra NaN group.
df.groupby("key1", dropna=False).mean()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.5,0.041438,0.179974
b,1.5,0.644438,-0.464574
,1.0,1.113451,-0.68536


In [71]:
# Group the data1 Series by the key1 Series (passed as an object, not as a column name).
grouped = df["data1"].groupby(df["key1"])
grouped.mean()

key1
None    1.113451
a       0.041438
b       0.644438
Name: data1, dtype: float64

In [73]:
# Group on both key columns at once and take the mean of the remaining columns.
mean_df = df.groupby(["key1", "key2"]).mean()

In [90]:
# Show the current contents of mean_df.
# NOTE(review): the saved output of this cell (In [90]) already shows the unstacked,
# renamed columns produced by later cells — it looks like it was run out of order;
# re-run top to bottom to refresh it.
mean_df

data,data1,data1,data2,data2
key2,1,2,1,2
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
,1.113451,,-0.68536,
a,0.496714,-0.138264,0.767435,-0.469474
b,-0.234153,1.52303,-0.46573,-0.463418


In [78]:
# Pivot the innermost row level (key2) into the columns. The result is rebuilt
# from df rather than by unstacking mean_df onto itself, so re-running this cell
# cannot unstack a second time.
mean_df = df.groupby(["key1", "key2"]).mean().unstack()

In [79]:
# Number of levels in the column index.
mean_df.columns.nlevels

2

In [80]:
# Show mean_df again: key1 labels the rows and key2 is the inner column level.
mean_df

Unnamed: 0_level_0,data1,data1,data2,data2
key2,1,2,1,2
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
,1.113451,,-0.68536,
a,0.496714,-0.138264,0.767435,-0.469474
b,-0.234153,1.52303,-0.46573,-0.463418


In [86]:
# Names of the column levels; the outer level is still unnamed (None) here.
mean_df.columns.names

FrozenList([None, 'key2'])

In [87]:
# Label both column levels: the outer level becomes "data", the inner one "key2".
mean_df.columns = mean_df.columns.set_names(["data", "key2"])
mean_df

data,data1,data1,data2,data2
key2,1,2,1,2
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
,1.113451,,-0.68536,
a,0.496714,-0.138264,0.767435,-0.469474
b,-0.234153,1.52303,-0.46573,-0.463418


In [88]:
# Exchange the two column levels so that key2 becomes the outer level.
# mean_df itself is not reassigned here; only the swapped result is displayed.
mean_df.swaplevel("data", "key2", axis="columns")

key2,1,2,1,2
data,data1,data1,data2,data2
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
,1.113451,,-0.68536,
a,0.496714,-0.138264,0.767435,-0.469474
b,-0.234153,1.52303,-0.46573,-0.463418


In [89]:
# size() gives the number of rows in each (key1, key2) group.
df.groupby(["key1", "key2"]).size()

key1  key2
None  1       2
a     1       1
      2       1
b     1       1
      2       1
dtype: int64

- `count` computes the number of non-null values in each group (per column), whereas `size` counts every row in the group

In [98]:
# Per-column count of non-null entries for each key1 label, NaN group included.
df.groupby("key1", dropna=False).count()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,2,3,3
b,2,2,2
,2,2,2


In [100]:
# Iterating a GroupBy yields (group label, sub-frame) pairs.
for label, rows in df.groupby("key1"):
    print(label, rows, sep="\n")

a
  key1  key2     data1     data2
0    a     1  0.496714  0.767435
1    a     2 -0.138264 -0.469474
5    a  <NA> -0.234137  0.241962
b
  key1  key2     data1     data2
3    b     2  1.523030 -0.463418
4    b     1 -0.234153 -0.465730


In [101]:
# With several keys, each group label is a tuple holding one value per key.
for keys, rows in df.groupby(["key1", "key2"]):
    print(*keys)
    print(rows)

a 1
  key1  key2     data1     data2
0    a     1  0.496714  0.767435
a 2
  key1  key2     data1     data2
1    a     2 -0.138264 -0.469474
b 1
  key1  key2     data1    data2
4    b     1 -0.234153 -0.46573
b 2
  key1  key2    data1     data2
3    b     2  1.52303 -0.463418


In [102]:
# Collect the groups into a dict keyed by the (key1, key2) tuple.
# iter() is used so dict() consumes the (label, frame) pairs directly.
pieces = dict(iter(df.groupby(["key1", "key2"])))
pieces

{('a',
  1):   key1  key2     data1     data2
 0    a     1  0.496714  0.767435,
 ('a',
  2):   key1  key2     data1     data2
 1    a     2 -0.138264 -0.469474,
 ('b',
  1):   key1  key2     data1    data2
 4    b     1 -0.234153 -0.46573,
 ('b',
  2):   key1  key2    data1     data2
 3    b     2  1.52303 -0.463418}

- grouping on `columns` axis

In [106]:
# Bucket the column labels with a mapping, then select each bucket from df.
# DataFrame.groupby(axis=1) is deprecated since pandas 2.1; grouping the labels
# themselves avoids it and leaves every column's dtype untouched.
column_buckets = {"key1": "key", "key2": "key", "data1": "data", "data2": "data"}
grouped_cols = df.columns.to_series().groupby(column_buckets)
for name, labels in grouped_cols:
    print(name)
    print(df[labels.tolist()])

data
      data1     data2
0  0.496714  0.767435
1 -0.138264 -0.469474
2  0.647689  0.542560
3  1.523030 -0.463418
4 -0.234153 -0.465730
5 -0.234137  0.241962
6  1.579213 -1.913280
key
   key1  key2
0     a     1
1     a     2
2  None     1
3     b     2
4     b     1
5     a  <NA>
6  None     1


In [107]:
# Index the grouped frame with a column name to keep only data1 within each group.
s_grouped = df.groupby(["key1", "key2"])["data1"]

In [111]:
# Unpacking iterates the groupby, so print receives one (key, Series) tuple per group.
print(*s_grouped)

(('a', 1), 0    0.496714
Name: data1, dtype: float64) (('a', 2), 1   -0.138264
Name: data1, dtype: float64) (('b', 1), 4   -0.234153
Name: data1, dtype: float64) (('b', 2), 3    1.52303
Name: data1, dtype: float64)


 - grouping by `dictionary` and `Series`

In [112]:
# 5x5 frame of standard-normal draws, one row per person.
people = pd.DataFrame(
    np.random.standard_normal(size=(5, 5)),
    index=["Joe", "Steve", "Wanda", "Jill", "Trey"],
    columns=["a", "b", "c", "d", "e"],
)

# Blank out columns b and c for the third row (Wanda) to add a few NA values.
people.loc["Wanda", ["b", "c"]] = np.nan
 

In [114]:
people

Unnamed: 0,a,b,c,d,e
Joe,-1.724918,-0.562288,-1.012831,0.314247,-0.908024
Steve,-1.412304,1.465649,-0.225776,0.067528,-1.424748
Wanda,-0.544383,,,0.375698,-0.600639
Jill,-0.291694,-0.601707,1.852278,-0.013497,-1.057711
Trey,0.822545,-1.220844,0.208864,-1.95967,-1.328186


In [117]:
# Map each column label to a colour group.
grp_mapping = {
    "a": "red",
    "b": "red",
    "c": "orange",
    "d": "orange",
    "e": "red",
    "f": "green",  # unused group mappings are ok
}
# Group the columns by transposing first: DataFrame.groupby(axis=1) is deprecated
# since pandas 2.1. people holds only floats, so the round trip through .T is lossless.
by_col = people.T.groupby(grp_mapping, dropna=False)
by_col.mean().T

Unnamed: 0,orange,red
Joe,-0.349292,-1.065076
Steve,-0.079124,-0.457134
Wanda,0.375698,-0.572511
Jill,0.91939,-0.65037
Trey,-0.875403,-0.575495


In [118]:
# The same label-to-group mapping expressed as a Series (index = column label).
map_series = pd.Series(grp_mapping)
map_series

a       red
b       red
c    orange
d    orange
e       red
f     green
dtype: object

In [120]:

# Group the columns by map_series this time — this cell is the Series counterpart
# of the dict-based grouping. The transpose replaces DataFrame.groupby(axis=1),
# which is deprecated since pandas 2.1.
by_col = people.T.groupby(map_series, dropna=False)
by_col.count().T

Unnamed: 0,orange,red
Joe,2,3
Steve,2,3
Wanda,1,2
Jill,2,3
Trey,2,3


- by `functions`

In [122]:
# Group together all people whose names have the same length.
people.groupby(len).count() # len is called on each index label and its return value becomes the group key

Unnamed: 0,a,b,c,d,e
3,1,1,1,1,1
4,2,2,2,2,2
5,2,1,1,2,2


- mix arrays with functions

In [123]:
# Functions and arrays can be mixed: group by name length together with key_list,
# whose entries line up positionally with the rows of people.
key_list = ["one", "one", "one", "two", "two"]
people.groupby([len, key_list]).count()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,1,1,1,1,1
4,two,2,2,2,2,2
5,one,2,1,1,2,2


In [124]:
# Redisplay people.
people

Unnamed: 0,a,b,c,d,e
Joe,-1.724918,-0.562288,-1.012831,0.314247,-0.908024
Steve,-1.412304,1.465649,-0.225776,0.067528,-1.424748
Wanda,-0.544383,,,0.375698,-0.600639
Jill,-0.291694,-0.601707,1.852278,-0.013497,-1.057711
Trey,0.822545,-1.220844,0.208864,-1.95967,-1.328186


- groupby `index levels`

In [125]:
# Two-level column index: "cty" on the outside, "tenor" on the inside.
columns = pd.MultiIndex.from_arrays(
    [
        ["US", "US", "US", "JP", "JP"],
        [1, 3, 5, 1, 3],
    ],
    names=["cty", "tenor"],
)
# Four rows of standard-normal draws under those hierarchical columns.
hier_df = pd.DataFrame(np.random.standard_normal(size=(4, 5)), columns=columns)
hier_df


cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.196861,0.738467,0.171368,-0.115648,-0.301104
1,-1.478522,-0.719844,-0.460639,1.057122,0.343618
2,-1.76304,0.324084,-0.385082,-0.676922,0.611676
3,1.031,0.93128,-0.839218,-0.309212,0.331263


In [130]:
# Average the columns that share a tenor (column level 1). Transposing and grouping
# the rows replaces DataFrame.groupby(axis="columns"), deprecated since pandas 2.1.
hier_df.T.groupby(level=1).mean().T

tenor,1,3,5
0,0.040606,0.218681,0.171368
1,-0.2107,-0.188113,-0.460639
2,-1.219981,0.46788,-0.385082
3,0.360894,0.631272,-0.839218


# Data Aggregation