# Groupby

## 基础

### 导入库

In [30]:
import numpy as np
import pandas as pd

### 生成数据

In [31]:
# Sample data: one row per animal (the index label) with its class, order and
# max_speed. The max_speed of 'monkey' is missing (NaN).
df1 = pd.DataFrame(
    {
        'class': ['bird', 'bird', 'mammal', 'mammal', 'mammal'],
        'order': ['Falconiformes', 'Psittaciformes', 'Carnivora', 'Primates', 'Carnivora'],
        'max_speed': [389.0, 24.0, 80.2, np.nan, 58],
    },
    index=['falcon', 'parrot', 'lion', 'monkey', 'leopard'],
)

In [32]:
# Show df1 as the cell output.
df1

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,
leopard,mammal,Carnivora,58.0


### GroupBy对象

GroupBy对象是pandas.DataFrame.groupby或pandas.Series.groupby的返回值。\
pandas.DataFrame.groupby: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html#pandas.DataFrame.groupby \
pandas.Series.groupby: https://pandas.pydata.org/docs/reference/api/pandas.Series.groupby.html#pandas.Series.groupby

#### 创建GroupBy对象

创建GroupBy对象需要我们建立一个映射关系(mapping)，即什么值对应什么分组。这种mapping可以是以下6种类型：

1.A Python function, to be called on each of the axis labels.\
2.A list or NumPy array of the same length as the selected axis.\
3.A dict or Series, providing a label -> group name mapping.\
4.For DataFrame objects, a string indicating a column to be used to group. Of course df.groupby('A') is just syntactic sugar for df.groupby(df['A']), but it makes life simpler.\
5.For DataFrame objects, a string indicating an index level to be used to group.\
6.A list of any of the above things.

1.根据一个函数分组。\
（1）axis=0(默认),该函数输入为DataFrame的index的一个值，返回的是分类标签。\
（2）axis=1，该函数输入为DataFrame的column的一个值，返回的是分类标签。

In [33]:
# Birds form one group, mammals the other.
def classify_by_index(series):
    """Return the group name for a single index label.

    Despite its name, ``series`` is one label (e.g. ``'falcon'``), not a
    pandas Series. Returns ``'bird'`` or ``'mammal'``, or ``None`` when the
    label is in neither list.
    """
    membership = (
        ('bird', ('parrot', 'falcon')),
        ('mammal', ('lion', 'monkey', 'leopard')),
    )
    for group_name, labels in membership:
        if series in labels:
            return group_name
    return None

In [84]:
# Group the rows by the name classify_by_index returns for each index label.
# Rows (axis 0, alias 'index') are the default grouping axis, so the `axis`
# keyword is left out: passing it explicitly is deprecated since pandas 2.1.
grouped_func1 = df1.groupby(classify_by_index)

In [85]:
# Iterating a GroupBy object yields (group name, sub-DataFrame) pairs;
# the rows end up in two groups.
for name, group in grouped_func1:
    print(name, group, sep='\n')

bird
       class           order  max_speed
falcon  bird   Falconiformes      389.0
parrot  bird  Psittaciformes       24.0
mammal
          class      order  max_speed
lion     mammal  Carnivora       80.2
monkey   mammal   Primates        NaN
leopard  mammal  Carnivora       58.0


In [36]:
def classify_by_column(series):
    """Return the group code for a single column label.

    Despite its name, ``series`` is one column label, not a pandas Series.
    Returns 1 for ``'class'``, 2 for ``'order'``, 3 for ``'max_speed'`` and
    ``None`` for any other label.
    """
    codes = (
        (1, ('class',)),
        (2, ('order',)),
        (3, ('max_speed',)),
    )
    for code, labels in codes:
        if series in labels:
            return code
    return None

In [37]:
# Group the columns by the code classify_by_column returns for each column
# label. axis='columns' is the named alias of axis=1 (note the plural:
# 'column' is not a valid axis name).
# NOTE(review): grouping along the columns axis is deprecated in recent pandas
# releases (2.1+) - confirm against the installed version.
grouped_func2 = df1.groupby(by=classify_by_column, axis='columns')

In [38]:
# Grouping along the columns gives three groups, one per column.
for name, group in grouped_func2:
    print(name, group, sep='\n')

1
          class
falcon     bird
parrot     bird
lion     mammal
monkey   mammal
leopard  mammal
2
                  order
falcon    Falconiformes
parrot   Psittaciformes
lion          Carnivora
monkey         Primates
leopard       Carnivora
3
         max_speed
falcon       389.0
parrot        24.0
lion          80.2
monkey         NaN
leopard       58.0


2.根据列表或numpy数组分组

In [39]:
# A list can act as the grouping key when it has one entry per row:
# rows 0-1 go to group 0, rows 2-3 to group 1 and row 4 to group 2.
list1 = [0, 0, 1, 1, 2]
# Rows (axis 0, alias 'index') are the default grouping axis, so the `axis`
# keyword is left out: passing it explicitly is deprecated since pandas 2.1.
grouped_list1 = df1.groupby(list1)

In [40]:
# Print each group key followed by the rows assigned to it.
for name, group in grouped_list1:
    print(name, group, sep='\n')

0
       class           order  max_speed
falcon  bird   Falconiformes      389.0
parrot  bird  Psittaciformes       24.0
1
         class      order  max_speed
lion    mammal  Carnivora       80.2
monkey  mammal   Primates        NaN
2
          class      order  max_speed
leopard  mammal  Carnivora       58.0


In [41]:
# One entry per column: 'class' goes to group 0, 'order' and 'max_speed'
# go to group 1.
list2 = [0, 1, 1]
# axis='columns' is the named alias of axis=1 ('column' is not a valid name).
# NOTE(review): grouping along the columns axis is deprecated in recent pandas
# releases (2.1+) - confirm against the installed version.
grouped_list2 = df1.groupby(by=list2, axis='columns')

In [42]:
# Print each group key followed by the columns assigned to it.
for name, group in grouped_list2:
    print(name, group, sep='\n')

0
          class
falcon     bird
parrot     bird
lion     mammal
monkey   mammal
leopard  mammal
1
                  order  max_speed
falcon    Falconiformes      389.0
parrot   Psittaciformes       24.0
lion          Carnivora       80.2
monkey         Primates        NaN
leopard       Carnivora       58.0


In [43]:
# A NumPy array with one entry per row works the same way as a list key.
arr1 = np.array([0, 0, 1, 1, 2])
# Rows (axis 0, alias 'index') are the default grouping axis, so the `axis`
# keyword is left out: passing it explicitly is deprecated since pandas 2.1.
grouped_arr1 = df1.groupby(arr1)

In [44]:
# Print each group key followed by the rows assigned to it.
for name, group in grouped_arr1:
    print(name, group, sep='\n')

0
       class           order  max_speed
falcon  bird   Falconiformes      389.0
parrot  bird  Psittaciformes       24.0
1
         class      order  max_speed
lion    mammal  Carnivora       80.2
monkey  mammal   Primates        NaN
2
          class      order  max_speed
leopard  mammal  Carnivora       58.0


3.根据字典或Series分组\
（1）axis=0(默认),该字典的key(Series的index)为DataFrame的index的一个值，值是分类标签。\
（2）axis=1，该字典的key(Series的index)为DataFrame的column的一个值，值是分类标签。

In [56]:
# Ten rows on the default 0..9 index; column A holds 1..10, B holds 11..20
# and C holds 21..30.
df2 = pd.DataFrame({
    'A': list(range(1, 11)),
    'B': list(range(11, 21)),
    'C': list(range(21, 31)),
})

In [57]:
# Map each row label 0..9 to a group name: even labels to 'a', odd labels to 'b'.
dict1 = {label: ('a' if label % 2 == 0 else 'b') for label in range(10)}

In [58]:
# Group the rows through the dict: each index label is looked up in dict1 and
# the mapped value becomes its group name.
# Rows (axis 0, alias 'index') are the default grouping axis, so the `axis`
# keyword is left out: passing it explicitly is deprecated since pandas 2.1.
grouped_dict1 = df2.groupby(dict1)

In [59]:
# Print each group name followed by the rows assigned to it.
for name, group in grouped_dict1:
    print(name, group, sep='\n')

a
   A   B   C
0  1  11  21
2  3  13  23
4  5  15  25
6  7  17  27
8  9  19  29
b
    A   B   C
1   2  12  22
3   4  14  24
5   6  16  26
7   8  18  28
9  10  20  30


In [60]:
# Map each column label to a group name: A and B to 'a', C to 'b'.
dict2 = dict(A='a', B='a', C='b')

In [61]:
# Group the columns through the dict: each column label is looked up in dict2.
# axis='columns' is the named alias of axis=1 ('column' is not a valid name).
# NOTE(review): grouping along the columns axis is deprecated in recent pandas
# releases (2.1+) - confirm against the installed version.
grouped_dict2 = df2.groupby(by=dict2, axis='columns')

In [62]:
# Print each group name followed by the columns assigned to it.
for name, group in grouped_dict2:
    print(name, group, sep='\n')

a
    A   B
0   1  11
1   2  12
2   3  13
3   4  14
4   5  15
5   6  16
6   7  17
7   8  18
8   9  19
9  10  20
b
    C
0  21
1  22
2  23
3  24
4  25
5  26
6  27
7  28
8  29
9  30


In [63]:
# Same label -> group mapping as a dict, wrapped in a pandas Series:
# the Series index holds the row labels 0..9, the values hold the group names.
series1 = pd.Series({label: ('a', 'b')[label % 2] for label in range(10)})

In [67]:
# Group the rows through the Series: series1 is indexed by row label and its
# values are the group names.
# Rows (axis 0, alias 'index') are the default grouping axis, so the `axis`
# keyword is left out: passing it explicitly is deprecated since pandas 2.1.
grouped_series1 = df2.groupby(series1)

In [68]:
# Print each group name followed by the rows assigned to it.
for name, group in grouped_series1:
    print(name, group, sep='\n')

a
   A   B   C
0  1  11  21
2  3  13  23
4  5  15  25
6  7  17  27
8  9  19  29
b
    A   B   C
1   2  12  22
3   4  14  24
5   6  16  26
7   8  18  28
9  10  20  30


In [69]:
# Column label -> group name mapping as a Series: A and B map to 'a', C to 'b'.
series2 = pd.Series(dict(A='a', B='a', C='b'))

In [71]:
# Group the columns through the Series: series2 is indexed by column label and
# its values are the group names.
# axis='columns' is the named alias of axis=1 ('column' is not a valid name).
# NOTE(review): grouping along the columns axis is deprecated in recent pandas
# releases (2.1+) - confirm against the installed version.
grouped_series2 = df2.groupby(by=series2, axis='columns')

In [72]:
# Print each group name followed by the columns assigned to it.
for name, group in grouped_series2:
    print(name, group, sep='\n')

a
    A   B
0   1  11
1   2  12
2   3  13
3   4  14
4   5  15
5   6  16
6   7  17
7   8  18
8   9  19
9  10  20
b
    C
0  21
1  22
2  23
3  24
4  25
5  26
6  27
7  28
8  29
9  30


4.对于DataFrame来说，按列名进行分组

In [73]:
# Show df1 again as the cell output before grouping it by column name.
df1

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,
leopard,mammal,Carnivora,58.0


In [80]:
# Group the rows by the values of the 'class' column.
# Rows (axis 0, alias 'index') are the default grouping axis, so the `axis`
# keyword is left out: passing it explicitly is deprecated since pandas 2.1.
grouped_class = df1.groupby('class')

In [81]:
# Print each class name followed by the rows that belong to it.
for name, group in grouped_class:
    print(name, group, sep='\n')

bird
       class           order  max_speed
falcon  bird   Falconiformes      389.0
parrot  bird  Psittaciformes       24.0
mammal
          class      order  max_speed
lion     mammal  Carnivora       80.2
monkey   mammal   Primates        NaN
leopard  mammal  Carnivora       58.0


In [82]:
# Group the rows by the values of the 'order' column.
# Rows (axis 0, alias 'index') are the default grouping axis, so the `axis`
# keyword is left out: passing it explicitly is deprecated since pandas 2.1.
grouped_order = df1.groupby('order')

In [83]:
# Print each order name followed by the rows that belong to it.
for name, group in grouped_order:
    print(name, group, sep='\n')

Carnivora
          class      order  max_speed
lion     mammal  Carnivora       80.2
leopard  mammal  Carnivora       58.0
Falconiformes
       class          order  max_speed
falcon  bird  Falconiformes      389.0
Primates
         class     order  max_speed
monkey  mammal  Primates        NaN
Psittaciformes
       class           order  max_speed
parrot  bird  Psittaciformes       24.0


当然，也可以直接把某一列对应的Series（例如 df1['order']）传给groupby；实际上，这种方式和之前按Series分组是一样的。

In [88]:
# Pass the 'order' column itself (a Series) as the grouping key instead of
# its name.
# Rows (axis 0, alias 'index') are the default grouping axis, so the `axis`
# keyword is left out: passing it explicitly is deprecated since pandas 2.1.
grouped_order_series = df1.groupby(df1['order'])

In [90]:
# Print each order name followed by the rows that belong to it.
for name, group in grouped_order_series:
    print(name, group, sep='\n')

Carnivora
          class      order  max_speed
lion     mammal  Carnivora       80.2
leopard  mammal  Carnivora       58.0
Falconiformes
       class          order  max_speed
falcon  bird  Falconiformes      389.0
Primates
         class     order  max_speed
monkey  mammal  Primates        NaN
Psittaciformes
       class           order  max_speed
parrot  bird  Psittaciformes       24.0
