# Groupby

## 基础

### 导入库

In [1]:
import numpy as np
import pandas as pd

### 生成数据

In [2]:
df1 = pd.DataFrame([('bird', 'Falconiformes', 389.0),
                    ('bird', 'Psittaciformes', 24.0),
                   ('mammal', 'Carnivora', 80.2),
                 ('mammal', 'Primates', np.nan),
                  ('mammal', 'Carnivora', 58)],
                  index=['falcon', 'parrot', 'lion', 'monkey', 'leopard'],
                  columns=('class', 'order', 'max_speed'))

In [3]:
df1

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,
leopard,mammal,Carnivora,58.0


### GroupBy对象

GroupBy对象是pandas.DataFrame.groupby或pandas.Series.groupby的返回值。\
pandas.DataFrame.groupby: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html#pandas.DataFrame.groupby \
pandas.Series.groupby: https://pandas.pydata.org/docs/reference/api/pandas.Series.groupby.html#pandas.Series.groupby

#### 创建Groupby对象

创建Groupby对象需要我们建立一个映射关系(mapping)，即什么值对应什么分组。这种mapping可以是以下6种类型：

1.A Python function, to be called on each of the axis labels.\
2.A list or NumPy array of the same length as the selected axis.\
3.A dict or Series, providing a label -> group name mapping.\
4.For DataFrame objects, a string indicating a column to be used to group. Of course df.groupby('A') is just syntactic sugar for df.groupby(df['A']), but it makes life simpler.\
5.For DataFrame objects, a string indicating an index level to be used to group.\
6.A list of any of the above things.

1.根据一个函数分组。\
（1）axis=0(默认),该函数输入为DataFrame的index的一个值，返回的是分类标签。\
（2）axis=1，该函数输入为DataFrame的column的一个值，返回的是分类标签。

In [4]:
#bird分一组，mammal分一组
def classify_by_index(series):
    if series in ['parrot','falcon']:
        return 'bird'
    elif series in ['lion','monkey','leopard']:
        return 'mammal'

In [5]:
grouped_func1 = df1.groupby(classify_by_index,axis=0)  #axis=0可以省略，axis=0是默认值
#grouped_func1 = df1.groupby(classify_by_index,axis='index')

In [6]:
#可以看到分成两组
for name,group in grouped_func1:
    print(name)
    print(group)

bird
       class           order  max_speed
falcon  bird   Falconiformes      389.0
parrot  bird  Psittaciformes       24.0
mammal
          class      order  max_speed
lion     mammal  Carnivora       80.2
monkey   mammal   Primates        NaN
leopard  mammal  Carnivora       58.0


In [7]:
def classify_by_column(series):
    if series in ['class']:
        return 1
    elif series in ['order']:
        return 2
    elif series in ['max_speed']:
        return 3

In [8]:
grouped_func2 = df1.groupby(classify_by_column,axis=1)
#grouped_func2 = df1.groupby(classify_by_column,axis='column')

In [9]:
#按列分成三列
for name,group in grouped_func2:
    print(name)
    print(group)

1
          class
falcon     bird
parrot     bird
lion     mammal
monkey   mammal
leopard  mammal
2
                  order
falcon    Falconiformes
parrot   Psittaciformes
lion          Carnivora
monkey         Primates
leopard       Carnivora
3
         max_speed
falcon       389.0
parrot        24.0
lion          80.2
monkey         NaN
leopard       58.0


2.根据列表或numpy数组分组

In [10]:
list1 = [0,0,1,1,2]
grouped_list1 = df1.groupby(list1,axis=0)
#grouped_list1 = df1.groupby(list1,axis='index')

In [11]:
for name, group in grouped_list1:
    print(name)
    print(group)

0
       class           order  max_speed
falcon  bird   Falconiformes      389.0
parrot  bird  Psittaciformes       24.0
1
         class      order  max_speed
lion    mammal  Carnivora       80.2
monkey  mammal   Primates        NaN
2
          class      order  max_speed
leopard  mammal  Carnivora       58.0


In [12]:
list2 = [0,1,1]
grouped_list2 = df1.groupby(list2,axis=1)
#grouped_list2 = df1.groupby(list1,axis='column')

In [13]:
for name,group in grouped_list2:
    print(name)
    print(group)

0
          class
falcon     bird
parrot     bird
lion     mammal
monkey   mammal
leopard  mammal
1
                  order  max_speed
falcon    Falconiformes      389.0
parrot   Psittaciformes       24.0
lion          Carnivora       80.2
monkey         Primates        NaN
leopard       Carnivora       58.0


In [14]:
arr1 = np.array([0,0,1,1,2])
grouped_arr1 = df1.groupby(arr1,axis=0)
#grouped_arr1 = df1.groupby(arr1,axis='index')

In [15]:
for name,group in grouped_arr1:
    print(name)
    print(group)

0
       class           order  max_speed
falcon  bird   Falconiformes      389.0
parrot  bird  Psittaciformes       24.0
1
         class      order  max_speed
lion    mammal  Carnivora       80.2
monkey  mammal   Primates        NaN
2
          class      order  max_speed
leopard  mammal  Carnivora       58.0


3.根据字典或Series分组\
（1）axis=0(默认),该字典的key(Series的index)为DataFrame的index的一个值，值是分类标签。\
（2）axis=1，该字典的key(Series的index)为DataFrame的column的一个值，值是分类标签。

In [16]:
df2 = pd.DataFrame({'A':[1,2,3,4,5,6,7,8,9,10],'B':[11,12,13,14,15,16,17,18,19,20],'C':[21,22,23,24,25,26,27,28,29,30]})

In [17]:
dict1 = {0:'a',1:'b',2:'a',3:'b',4:'a',5:'b',6:'a',7:'b',8:'a',9:'b'}

In [18]:
grouped_dict1 = df2.groupby(dict1,axis=0)
#grouped_dict1 = df2.groupby(dict1,axis='index')

In [19]:
for name, group in grouped_dict1:
    print(name)
    print(group)

a
   A   B   C
0  1  11  21
2  3  13  23
4  5  15  25
6  7  17  27
8  9  19  29
b
    A   B   C
1   2  12  22
3   4  14  24
5   6  16  26
7   8  18  28
9  10  20  30


In [20]:
dict2 = {'A':'a','B':'a','C':'b'}

In [21]:
grouped_dict2 = df2.groupby(dict2,axis=1)
#grouped_dict2 = df2.groupby(dict2,axis='column')

In [22]:
for name, group in grouped_dict2:
    print(name)
    print(group)

a
    A   B
0   1  11
1   2  12
2   3  13
3   4  14
4   5  15
5   6  16
6   7  17
7   8  18
8   9  19
9  10  20
b
    C
0  21
1  22
2  23
3  24
4  25
5  26
6  27
7  28
8  29
9  30


In [23]:
series1 = pd.Series({0:'a',1:'b',2:'a',3:'b',4:'a',5:'b',6:'a',7:'b',8:'a',9:'b'})  #字典转化为pandas.Series

In [24]:
grouped_series1 = df2.groupby(series1,axis=0)
#grouped_series1 = df2.groupby(series1,axis='index')

In [25]:
for name, group in grouped_series1:
    print(name)
    print(group)

a
   A   B   C
0  1  11  21
2  3  13  23
4  5  15  25
6  7  17  27
8  9  19  29
b
    A   B   C
1   2  12  22
3   4  14  24
5   6  16  26
7   8  18  28
9  10  20  30


In [26]:
series2 = pd.Series({'A':'a','B':'a','C':'b'})

In [27]:
grouped_series2 = df2.groupby(series2,axis=1)
#grouped_series2 = df2.groupby(series2,axis='column')

In [28]:
for name,group in grouped_series2:
    print(name)
    print(group)

a
    A   B
0   1  11
1   2  12
2   3  13
3   4  14
4   5  15
5   6  16
6   7  17
7   8  18
8   9  19
9  10  20
b
    C
0  21
1  22
2  23
3  24
4  25
5  26
6  27
7  28
8  29
9  30


4.对于DataFrame来说，按列名进行分组

In [29]:
df1

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,
leopard,mammal,Carnivora,58.0


In [30]:
grouped_class = df1.groupby('class',axis=0)
#grouped_class = df1.groupby('class',axis='index')

In [31]:
for name,group in grouped_class:
    print(name)
    print(group)

bird
       class           order  max_speed
falcon  bird   Falconiformes      389.0
parrot  bird  Psittaciformes       24.0
mammal
          class      order  max_speed
lion     mammal  Carnivora       80.2
monkey   mammal   Primates        NaN
leopard  mammal  Carnivora       58.0


In [32]:
grouped_order = df1.groupby('order',axis=0)
#grouped_order = df1.groupby('order',axis='index')

In [33]:
for name,group in grouped_order:
    print(name)
    print(group)

Carnivora
          class      order  max_speed
lion     mammal  Carnivora       80.2
leopard  mammal  Carnivora       58.0
Falconiformes
       class          order  max_speed
falcon  bird  Falconiformes      389.0
Primates
         class     order  max_speed
monkey  mammal  Primates        NaN
Psittaciformes
       class           order  max_speed
parrot  bird  Psittaciformes       24.0


5.当然，这种取列的方式也可以直接用某列的Series，但实际上，这种方式和之前的Series分组是一样。

In [34]:
grouped_order_series = df1.groupby(df1['order'],axis=0)
#grouped_order_series = df1.groupby(df1['order'],axis='index')

In [35]:
for name,group in grouped_order_series:
    print(name)
    print(group)

Carnivora
          class      order  max_speed
lion     mammal  Carnivora       80.2
leopard  mammal  Carnivora       58.0
Falconiformes
       class          order  max_speed
falcon  bird  Falconiformes      389.0
Primates
         class     order  max_speed
monkey  mammal  Primates        NaN
Psittaciformes
       class           order  max_speed
parrot  bird  Psittaciformes       24.0


6.用列表来分组

用列表分组的最简单的方式就是在DataFrame中取多列

In [None]:
import numpy as np

In [44]:
grade_ = [1,1,2,3,3,2,6,6,5,5,4,4,3,5,6,2,4,4,5,4,3,2,4,3,2,6,5,3,3,5]  #年级
class_ = [1,3,6,4,2,5,5,4,3,4,2,2,2,1,1,6,5,4,3,3,2,1,5,4,3,4,2,1,4,5]  #班级
score_ = np.random.randint(85,95,30)                                    #分数
df2 = pd.DataFrame({'grade':grade_,'class':class_,'score':score_})
grouped2 = df2.groupby(['grade','class'])                                #按年级和班级两列分组才能唯一确定一个班级

#### groupby对象的属性和简单的方法

##### groupby对象是迭代器(有\_\_iter\_\_方法)，所以可以进行循环

In [45]:
for name,group in grouped2:
    print('name: ',name)
    print('group: ',group)

name:  (1, 1)
group:     grade  class  score
0      1      1     86
name:  (1, 3)
group:     grade  class  score
1      1      3     85
name:  (2, 1)
group:      grade  class  score
21      2      1     90
name:  (2, 3)
group:      grade  class  score
24      2      3     93
name:  (2, 5)
group:     grade  class  score
5      2      5     93
name:  (2, 6)
group:      grade  class  score
2       2      6     91
15      2      6     93
name:  (3, 1)
group:      grade  class  score
27      3      1     94
name:  (3, 2)
group:      grade  class  score
4       3      2     89
12      3      2     91
20      3      2     89
name:  (3, 4)
group:      grade  class  score
3       3      4     89
23      3      4     89
28      3      4     91
name:  (4, 2)
group:      grade  class  score
10      4      2     94
11      4      2     92
name:  (4, 3)
group:      grade  class  score
19      4      3     89
name:  (4, 4)
group:      grade  class  score
17      4      4     87
name:  (4, 5)
group:  

##### groups属性，Groupby对象转化为字典，字典的key是name, 字典的value是group label。group label是一个排序（行的位置）。

In [51]:
grouped2.groups

{(1, 1): [0], (1, 3): [1], (2, 1): [21], (2, 3): [24], (2, 5): [5], (2, 6): [2, 15], (3, 1): [27], (3, 2): [4, 12, 20], (3, 4): [3, 23, 28], (4, 2): [10, 11], (4, 3): [19], (4, 4): [17], (4, 5): [16, 22], (5, 1): [13], (5, 2): [26], (5, 3): [8, 18], (5, 4): [9], (5, 5): [29], (6, 1): [14], (6, 4): [7, 25], (6, 5): [6]}

##### indices属性，Groupby对象转化为字典，字典的key是name, 字典的value是group indice。group indice是行的索引。

In [53]:
grouped2.indices

{(1, 1): array([0], dtype=int64),
 (1, 3): array([1], dtype=int64),
 (2, 1): array([21], dtype=int64),
 (2, 3): array([24], dtype=int64),
 (2, 5): array([5], dtype=int64),
 (2, 6): array([ 2, 15], dtype=int64),
 (3, 1): array([27], dtype=int64),
 (3, 2): array([ 4, 12, 20], dtype=int64),
 (3, 4): array([ 3, 23, 28], dtype=int64),
 (4, 2): array([10, 11], dtype=int64),
 (4, 3): array([19], dtype=int64),
 (4, 4): array([17], dtype=int64),
 (4, 5): array([16, 22], dtype=int64),
 (5, 1): array([13], dtype=int64),
 (5, 2): array([26], dtype=int64),
 (5, 3): array([ 8, 18], dtype=int64),
 (5, 4): array([9], dtype=int64),
 (5, 5): array([29], dtype=int64),
 (6, 1): array([14], dtype=int64),
 (6, 4): array([ 7, 25], dtype=int64),
 (6, 5): array([6], dtype=int64)}

##### get_group方法，参数是name，即组名，返回值是该组的内容

In [54]:
grouped2.get_group((1,1))

Unnamed: 0,grade,class,score
0,1,1,86


In [55]:
grouped2.get_group((3,2))

Unnamed: 0,grade,class,score
4,3,2,89
12,3,2,91
20,3,2,89


### Grouper对象
注：Groupby对象和Grouper对象是不同的，要注意。
网址：https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Grouper.html#pandas.Grouper

In [62]:
df3 = pd.DataFrame(
    {
        "Animal": ["Falcon", "Parrot", "Falcon", "Falcon", "Parrot"],
        "Speed": [100, 5, 200, 300, 15],
    }
)

In [63]:
df3

Unnamed: 0,Animal,Speed
0,Falcon,100
1,Parrot,5
2,Falcon,200
3,Falcon,300
4,Parrot,15


In [64]:
pd.Grouper(key="Animal")

Grouper(key='Animal', axis=0, sort=False)

In [65]:
df3.groupby(pd.Grouper(key="Animal")).mean()

Unnamed: 0_level_0,Speed
Animal,Unnamed: 1_level_1
Falcon,200
Parrot,10


In [66]:
df3.groupby(['Animal']).mean()

Unnamed: 0_level_0,Speed
Animal,Unnamed: 1_level_1
Falcon,200
Parrot,10


In [68]:
df3 = pd.DataFrame(
   {
       "Publish date": [
            pd.Timestamp("2000-01-02"),
            pd.Timestamp("2000-01-02"),
            pd.Timestamp("2000-01-09"),
            pd.Timestamp("2000-01-16")
        ],
        "ID": [0, 1, 2, 3],
        "Price": [10, 20, 30, 40]
    }
)

In [69]:
df3

Unnamed: 0,Publish date,ID,Price
0,2000-01-02,0,10
1,2000-01-02,1,20
2,2000-01-09,2,30
3,2000-01-16,3,40


In [71]:
df3.groupby(pd.Grouper(key="Publish date", freq="1W")).mean()   #针对datetime和Timestamp，可以根据时间的频率分组

Unnamed: 0_level_0,ID,Price
Publish date,Unnamed: 1_level_1,Unnamed: 2_level_1
2000-01-02,0.5,15.0
2000-01-09,2.0,30.0
2000-01-16,3.0,40.0


In [72]:
type(df3['Publish date'].iloc[0])

pandas._libs.tslibs.timestamps.Timestamp

In [73]:
start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00'
rng = pd.date_range(start, end, freq='7min')
ts = pd.Series(np.arange(len(rng)) * 3, index=rng)

In [74]:
ts.groupby(pd.Grouper(freq='17min')).sum()

2000-10-01 23:14:00     0
2000-10-01 23:31:00     9
2000-10-01 23:48:00    21
2000-10-02 00:05:00    54
2000-10-02 00:22:00    24
Freq: 17T, dtype: int32

In [75]:
ts.groupby(pd.Grouper(freq='17min', origin='epoch')).sum()

2000-10-01 23:18:00     0
2000-10-01 23:35:00    18
2000-10-01 23:52:00    27
2000-10-02 00:09:00    39
2000-10-02 00:26:00    24
Freq: 17T, dtype: int32

In [79]:
ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum()

2000-10-01 23:24:00     3
2000-10-01 23:41:00    15
2000-10-01 23:58:00    45
2000-10-02 00:15:00    45
Freq: 17T, dtype: int32

In [80]:
ts.groupby(pd.Grouper(freq='17min', origin='start')).sum()

2000-10-01 23:30:00     9
2000-10-01 23:47:00    21
2000-10-02 00:04:00    54
2000-10-02 00:21:00    24
Freq: 17T, dtype: int32

In [81]:
ts.groupby(pd.Grouper(freq='17min', offset='23h30min')).sum()

2000-10-01 23:30:00     9
2000-10-01 23:47:00    21
2000-10-02 00:04:00    54
2000-10-02 00:21:00    24
Freq: 17T, dtype: int32

In [82]:
ts.groupby(pd.Grouper(freq='17min', offset='2min')).sum()

2000-10-01 23:16:00     0
2000-10-01 23:33:00     9
2000-10-01 23:50:00    36
2000-10-02 00:07:00    39
2000-10-02 00:24:00    24
Freq: 17T, dtype: int32

### Groupby的内置函数方法
分组的作用一般是用来进行分组汇总，先分组，再对每一个分组求一个统计量。

#### 创建DataFrame

In [83]:
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                    'C': np.random.randn(8),
                    'D': np.random.randn(8)})

In [84]:
df

Unnamed: 0,A,B,C,D
0,foo,one,1.791228,-0.249905
1,bar,one,1.269206,0.956345
2,foo,two,0.715012,1.645568
3,bar,three,1.510267,0.519068
4,foo,two,0.3847,-0.852861
5,bar,two,0.448319,0.023689
6,foo,one,0.197989,0.002997
7,foo,three,-0.095859,0.807703


#### 分组求和
分组是按行分组，求和是按列求和。

In [85]:
df.groupby('A').sum()  #只对数值列求和

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,3.227793,1.499102
foo,2.99307,1.353501


In [87]:
for name,group in df.groupby('A'):
    print(name)
    print(group)

bar
     A      B         C         D
1  bar    one  1.269206  0.956345
3  bar  three  1.510267  0.519068
5  bar    two  0.448319  0.023689
foo
     A      B         C         D
0  foo    one  1.791228 -0.249905
2  foo    two  0.715012  1.645568
4  foo    two  0.384700 -0.852861
6  foo    one  0.197989  0.002997
7  foo  three -0.095859  0.807703


In [88]:
df.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.269206,0.956345
bar,three,1.510267,0.519068
bar,two,0.448319,0.023689
foo,one,1.989217,-0.246908
foo,three,-0.095859,0.807703
foo,two,1.099712,0.792707


#### 分组求第一个数

In [89]:
df.groupby('A').first()

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.269206,0.956345
foo,one,1.791228,-0.249905


#### 分组求最后一个数

In [90]:
df.groupby('A').last()

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,two,0.448319,0.023689
foo,three,-0.095859,0.807703


#### 分组求平均值

In [91]:
df.groupby('A').mean()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,1.075931,0.499701
foo,0.598614,0.2707


#### 分组求每组行数

In [92]:
df.groupby('A').size()

A
bar    3
foo    5
dtype: int64

In [93]:
df.groupby(['A','B']).size()

A    B    
bar  one      1
     three    1
     two      1
foo  one      2
     three    1
     two      2
dtype: int64

In [94]:
df.groupby('A').count()

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,3,3,3
foo,5,5,5


In [95]:
df.groupby(['A','B']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1,1
bar,three,1,1
bar,two,1,1
foo,one,2,2
foo,three,1,1
foo,two,2,2


#### 分组求标准差

In [96]:
df.groupby('A').std()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.556731,0.46663
foo,0.728771,0.972589


#### 分组求方差

In [97]:
df.groupby('A').var()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.30995,0.217744
foo,0.531108,0.945929


#### 分组求最小值

In [98]:
df.groupby('A').min()

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.448319,0.023689
foo,one,-0.095859,-0.852861


#### 分组求最大值

In [99]:
df.groupby('A').max()

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,two,1.510267,0.956345
foo,two,1.791228,1.645568


In [104]:
df.groupby('A').get_group('bar')

Unnamed: 0,A,B,C,D
1,bar,one,1.269206,0.956345
3,bar,three,1.510267,0.519068
5,bar,two,0.448319,0.023689


In [106]:
df.groupby('A').get_group('foo')

Unnamed: 0,A,B,C,D
0,foo,one,1.791228,-0.249905
2,foo,two,0.715012,1.645568
4,foo,two,0.3847,-0.852861
6,foo,one,0.197989,0.002997
7,foo,three,-0.095859,0.807703


#### 取第n个数或若干几个数

In [105]:
df.groupby('A').nth(0) #取第1个数

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.269206,0.956345
foo,one,1.791228,-0.249905


In [107]:
df.groupby('A').nth(1)  #取第2个数

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,three,1.510267,0.519068
foo,two,0.715012,1.645568


In [108]:
df.groupby('A').nth([3,4]) #取第4个和第5个数

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
foo,one,0.197989,0.002997
foo,three,-0.095859,0.807703


In [109]:
df.groupby('A').nth(-1)  #取最后一个数

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,two,0.448319,0.023689
foo,three,-0.095859,0.807703


#### 分组描述性统计

In [112]:
df.groupby('A').describe()

Unnamed: 0_level_0,C,C,C,C,C,C,C,C,D,D,D,D,D,D,D,D
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
bar,3.0,1.075931,0.556731,0.448319,0.858763,1.269206,1.389737,1.510267,3.0,0.499701,0.46663,0.023689,0.271378,0.519068,0.737706,0.956345
foo,5.0,0.598614,0.728771,-0.095859,0.197989,0.3847,0.715012,1.791228,5.0,0.2707,0.972589,-0.852861,-0.249905,0.002997,0.807703,1.645568


In [111]:
df.groupby(['A','B']).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,C,C,C,C,C,C,C,D,D,D,D,D,D,D,D
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
A,B,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
bar,one,1.0,1.269206,,1.269206,1.269206,1.269206,1.269206,1.269206,1.0,0.956345,,0.956345,0.956345,0.956345,0.956345,0.956345
bar,three,1.0,1.510267,,1.510267,1.510267,1.510267,1.510267,1.510267,1.0,0.519068,,0.519068,0.519068,0.519068,0.519068,0.519068
bar,two,1.0,0.448319,,0.448319,0.448319,0.448319,0.448319,0.448319,1.0,0.023689,,0.023689,0.023689,0.023689,0.023689,0.023689
foo,one,2.0,0.994609,1.126591,0.197989,0.596299,0.994609,1.392919,1.791228,2.0,-0.123454,0.178828,-0.249905,-0.18668,-0.123454,-0.060229,0.002997
foo,three,1.0,-0.095859,,-0.095859,-0.095859,-0.095859,-0.095859,-0.095859,1.0,0.807703,,0.807703,0.807703,0.807703,0.807703,0.807703
foo,two,2.0,0.549856,0.233566,0.3847,0.467278,0.549856,0.632434,0.715012,2.0,0.396353,1.766656,-0.852861,-0.228254,0.396353,1.020961,1.645568


#### 分组求standard error

In [114]:
df.groupby('A').sem()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.321429,0.269409
foo,0.325916,0.434955


In [113]:
df.groupby(['A','B']).sem()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,,
bar,three,,
bar,two,,
foo,one,0.79662,0.126451
foo,three,,
foo,two,0.165156,1.249214


### Groupby的apply方法