### 1. 根据某一列或多列分组

In [2]:
from pandas import DataFrame, Series
import pandas as pd
import numpy as np

# Demo table: two label columns (key1, key2) and two integer data columns.
data_frame = DataFrame(dict(
    key1=list('aabba'),
    key2=['one', 'two', 'one', 'two', 'three'],
    data1=np.arange(5),
    data2=np.arange(10, 15),
))

# Bare expression so the notebook renders the frame.
data_frame

Unnamed: 0,data1,data2,key1,key2
0,0,10,a,one
1,1,11,a,two
2,2,12,b,one
3,3,13,b,two
4,4,14,a,three


In [3]:
# Group the rows of the frame by the values in the key1 column
data_gruop_by_key1 = data_frame.groupby('key1')

# Show the rows that fall into group 'a'. get_group fetches a single group
# directly instead of materialising every group into a dict first.
data_gruop_by_key1.get_group('a')

Unnamed: 0,data1,data2,key1,key2
0,0,10,a,one
1,1,11,a,two
4,4,14,a,three


In [4]:
# Show the rows that fall into group 'b' (single-group lookup, no dict of
# all groups needed)
data_gruop_by_key1.get_group('b')

Unnamed: 0,data1,data2,key1,key2
2,2,12,b,one
3,3,13,b,two


In [6]:
# Mean of each numeric column per group. The numeric columns are selected
# explicitly because key2 holds strings, which recent pandas versions refuse
# to average (TypeError) instead of silently dropping the column.
print(data_gruop_by_key1[['data1', 'data2']].mean())
print('-'*30)

# Number of rows in each group
print(data_gruop_by_key1.size())
print('-'*30)

         data1      data2
key1                     
a     1.666667  11.666667
b     2.500000  12.500000
------------------------------
key1
a    3
b    2
dtype: int64
------------------------------


In [7]:
# Group a single column (data1), using the key1 column as the group labels,
# then print the mean of each group
data1_gruop_by_key1 = data_frame['data1'].groupby(
    by=data_frame['key1'],
)
print(data1_gruop_by_key1.mean())

key1
a    1.666667
b    2.500000
Name: data1, dtype: float64


In [8]:
# Rebuild the demo table; key2 now repeats 'one' inside group 'a' so that
# grouping on (key1, key2) merges two rows.
data_frame = DataFrame(dict(
    key1=list('aabba'),
    key2=['one', 'one', 'one', 'two', 'three'],
    data1=np.arange(5),
    data2=np.arange(10, 15),
))

# Bare expression so the notebook renders the frame.
data_frame

Unnamed: 0,data1,data2,key1,key2
0,0,10,a,one
1,1,11,a,one
2,2,12,b,one
3,3,13,b,two
4,4,14,a,three


In [9]:
# Group on the (key1, key2) pair and total the data columns of each group
data_gruop_by_key12 = data_frame.groupby(by=['key1', 'key2'])
data_gruop_by_key12.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,1,21
a,three,4,14
b,one,2,12
b,two,3,13


### 2. 根据索引值分组

In [13]:
# 6x6 table of random integers drawn from {10, 11, 12}. Both the row labels
# and the column labels contain duplicates so they can serve as group keys.
# A locally seeded generator makes the values identical on every run without
# touching NumPy's global random state.
data_frame = DataFrame(
    np.random.RandomState(42).randint(10, 13, (6, 6)),
    index=list('aabbcc'),
    columns=list('112233')
)

data_frame

Unnamed: 0,1,1.1,2,2.1,3,3.1
a,11,11,11,11,10,12
a,11,11,11,10,10,11
b,10,11,11,11,10,12
b,11,11,10,12,12,12
c,12,11,12,11,12,12
c,11,11,11,11,10,12


In [14]:
# Group columns that share a label and average within each group.
# groupby(axis=1) is deprecated in recent pandas, so the frame is transposed,
# grouped along its (now row) labels, and transposed back.
data_frame.T.groupby(level=0).mean().T

Unnamed: 0,1,2,3
a,11.0,11.0,11.0
a,11.0,10.5,10.5
b,10.5,11.0,11.0
b,11.0,11.0,12.0
c,11.5,11.5,12.0
c,11.0,11.0,11.0


In [16]:
# Group rows that share an index label and average each column per group.
# Rows are the default grouping axis, so no axis argument is passed (the
# keyword itself is deprecated in recent pandas).
data_frame.groupby(level=0).mean()

Unnamed: 0,1,1.1,2,2.1,3,3.1
a,11.0,11.0,11.0,10.5,10.0,11.5
b,10.5,11.0,10.5,11.5,11.0,12.0
c,11.5,11.0,11.5,11.0,11.0,12.0


### 3. 手动指定索引进行分组

In [19]:
# 6x6 table holding 0..35 in row-major order, with columns labelled a-f
data_frame = DataFrame(
    data=np.arange(36).reshape(6, 6),
    columns=['a', 'b', 'c', 'd', 'e', 'f'],
)

# Bare expression so the notebook renders the frame.
data_frame

Unnamed: 0,a,b,c,d,e,f
0,0,1,2,3,4,5
1,6,7,8,9,10,11
2,12,13,14,15,16,17
3,18,19,20,21,22,23
4,24,25,26,27,28,29
5,30,31,32,33,34,35


In [21]:
# Group the columns by hand: the mapping assigns every column label to the
# name of the group it should belong to
group_mapping = {
    'a': 'first',
    'b': 'first',
    'c': 'first',
    'd': 'second',
    'e': 'third',
    'f': 'third',
}

# NOTE(review): groupby(axis=1) is deprecated in recent pandas. It is kept so
# that data_group_by_dict still groups the columns of the un-transposed frame.
data_group_by_dict = data_frame.groupby(group_mapping, axis=1)

# Print each of the three groups. get_group fetches one group directly
# instead of rebuilding a dict of all groups for every lookup.
print(data_group_by_dict.get_group('first'))
print('-'*13)
print(data_group_by_dict.get_group('second'))
print('-'*13)
print(data_group_by_dict.get_group('third'))

    a   b   c
0   0   1   2
1   6   7   8
2  12  13  14
3  18  19  20
4  24  25  26
5  30  31  32
-------------
    d
0   3
1   9
2  15
3  21
4  27
5  33
-------------
    e   f
0   4   5
1  10  11
2  16  17
3  22  23
4  28  29
5  34  35


In [22]:
# Add up the values inside every group and print the resulting totals
group_sums = data_group_by_dict.sum()
print(group_sums)

   first  second  third
0      3       3      9
1     21       9     21
2     39      15     33
3     57      21     45
4     75      27     57
5     93      33     69


In [224]:
# Table for grouping by row labels: values 0..35 in row-major order, with the
# rows labelled a-f and default integer column labels
data_frame = DataFrame(
    data=np.arange(36).reshape(6, 6),
    index=['a', 'b', 'c', 'd', 'e', 'f'],
)

print(data_frame)

    0   1   2   3   4   5
a   0   1   2   3   4   5
b   6   7   8   9  10  11
c  12  13  14  15  16  17
d  18  19  20  21  22  23
e  24  25  26  27  28  29
f  30  31  32  33  34  35


In [222]:
# Map every row label to the name of the group it should belong to
group_mapping = {
    'a': 'first',
    'b': 'first',
    'c': 'first',
    'd': 'second',
    'e': 'third',
    'f': 'third',
}

# Rows are the default grouping axis, so the axis keyword (deprecated in
# recent pandas) is omitted
data_group_by_dict = data_frame.groupby(group_mapping)

# Print each of the three groups. get_group fetches one group directly
# instead of rebuilding a dict of all groups for every lookup.
print(data_group_by_dict.get_group('first'))
print('-'*25)
print(data_group_by_dict.get_group('second'))
print('-'*25)
print(data_group_by_dict.get_group('third'))

# Column-wise totals of each group (rendered as the cell's result)
data_group_by_dict.sum()

    0   1   2   3   4   5
a   0   1   2   3   4   5
b   6   7   8   9  10  11
c  12  13  14  15  16  17
-------------------------
    0   1   2   3   4   5
d  18  19  20  21  22  23
-------------------------
    0   1   2   3   4   5
e  24  25  26  27  28  29
f  30  31  32  33  34  35


Unnamed: 0,0,1,2,3,4,5
first,18,21,24,27,30,33
second,18,19,20,21,22,23
third,54,56,58,60,62,64
