In [1]:
import numpy as np
import pandas as pd

# 1.groupby方法：

In [2]:
# Example frame used throughout: two grouping keys (a string column and a
# nullable-integer column) plus two integer value columns.
df = pd.DataFrame(
    {
        "key1": list("aabbab"),
        "key2": pd.array([1, 2, 1, 2, 1, 1], dtype="Int64"),
        "data1": [1, 2, 3, 4, 5, 6],
        "data2": [7, 8, 9, 10, 11, 12],
    }
)

df

Unnamed: 0,key1,key2,data1,data2
0,a,1,1,7
1,a,2,2,8
2,b,1,3,9
3,b,2,4,10
4,a,1,5,11
5,b,1,6,12


### 用key1的标签来计算data1列的平均值

In [3]:
# Group the data1 values using the key1 column as the grouping labels.
# Displaying the result only shows the SeriesGroupBy object itself.
grouped = df["data1"].groupby(df["key1"])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001F80FFCEA90>

In [4]:
# Mean of data1 within each key1 group.
grouped.mean()

key1
a    2.666667
b    4.333333
Name: data1, dtype: float64

### 使用两个键对数据进行分组

In [5]:
# Two Series passed as keys: the group means come back with a two-level
# (key1, key2) index.
res = (
    df["data1"]
    .groupby([df["key1"], df["key2"]])
    .mean()
)
res

key1  key2
a     1       3.0
      2       2.0
b     1       4.5
      2       4.0
Name: data1, dtype: float64

In [6]:
# Pivot the inner index level (key2) into columns, giving a key1 x key2 table.
res.unstack()

key2,1,2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3.0,2.0
b,4.5,4.0


### 分组键也可以是正确长度的任意数组

In [7]:
# Free-standing grouping keys, one entry per row of df:
# X1 is an ndarray of string labels, X2 a plain Python list of years.
X1 = np.array(["AA"] * 2 + ["BB"] * 3 + ["AA"])
X2 = [2000, 2000, 2020, 2000, 2020, 2020]

In [8]:
# Group by the external sequences X1 and X2 rather than by columns of df.
df["data1"].groupby([X1,X2]).mean()

AA  2000    1.5
    2020    6.0
BB  2000    4.0
    2020    4.0
Name: data1, dtype: float64

### 传递列名作为分组键

In [9]:
# Passing a column name groups by that column; the remaining columns are averaged.
df.groupby("key1").mean()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.333333,2.666667,8.666667
b,1.333333,4.333333,10.333333


In [10]:
# "key1" holds strings, so it cannot be averaged. Older pandas silently dropped
# such non-numeric ("nuisance") columns from the result; pandas >= 2.0 raises a
# TypeError instead. numeric_only=True asks for the exclusion explicitly and
# yields the same data1/data2 table on both old and new versions.
df.groupby("key2").mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.75,9.75
2,3.0,9.0


In [11]:
# Group by both key columns; the result is indexed by (key1, key2) pairs.
df.groupby(["key1", "key2"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,3.0,9.0
a,2,2.0,8.0
b,1,4.5,10.5
b,2,4.0,10.0


### size方法

In [12]:
# Number of rows in each (key1, key2) group.
df.groupby(["key1", "key2"]).size()

key1  key2
a     1       2
      2       1
b     1       2
      2       1
dtype: int64

In [13]:
# count() reports a separate tally per column for each (key1, key2) group;
# for this frame every tally equals the group size returned by size().
df.groupby(["key1", "key2"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,2
a,2,1,1
b,1,2,2
b,2,1,1


### 分组键中的缺失值默认会被排除在结果之外；如需保留缺失值分组，可以向 groupby 传入 dropna=False

In [14]:
# Variant of the example frame with missing entries in both key columns
# (None in key1, a missing value in the nullable-integer key2).
df2 = pd.DataFrame(
    {
        "key1": ["a", None, "b", "b", None, "b"],
        "key2": pd.array([1, 2, None, 2, 1, 1], dtype="Int64"),
        "data1": [1, 2, 3, 4, 5, 6],
        "data2": [7, 8, 9, 10, 11, 12],
    }
)

df2

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,1,7
1,,2.0,2,8
2,b,,3,9
3,b,2.0,4,10
4,,1.0,5,11
5,b,1.0,6,12


In [15]:
# Rows whose key1 or key2 is missing do not appear in the result:
# only the (a, 1), (b, 1) and (b, 2) groups remain.
df2.groupby(["key1", "key2"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,1.0,7.0
b,1,6.0,12.0
b,2,4.0,10.0


In [16]:
# Group sizes for df2; rows with a missing key are not counted in any group.
df2.groupby(["key1", "key2"]).size()

key1  key2
a     1       1
b     1       1
      2       1
dtype: int64

# 2.遍历各分组：

In [17]:
# Show the example frame again as a reference for the iteration examples.
df

Unnamed: 0,key1,key2,data1,data2
0,a,1,1,7
1,a,2,2,8
2,b,1,3,9
3,b,2,4,10
4,a,1,5,11
5,b,1,6,12


### groupby 返回的对象支持迭代，生成由组名和数据块构成的二元元组序列

In [18]:
# Walk over the key1 groups: each step yields the group label and the
# sub-frame holding that group's rows; print one after the other.
for name, group in df.groupby("key1"):
    print(name, group, sep="\n")

a
  key1  key2  data1  data2
0    a     1      1      7
1    a     2      2      8
4    a     1      5     11
b
  key1  key2  data1  data2
2    b     1      3      9
3    b     2      4     10
5    b     1      6     12


### 可以生成由组名和数据块组成的字典，然后选中任一数据块进行你想要的操作。

In [19]:
# Materialise the (group label, sub-frame) pairs and turn them into a dict
# keyed by group label.
pieces = dict(list(df.groupby("key1")))
pieces

{'a':   key1  key2  data1  data2
 0    a     1      1      7
 1    a     2      2      8
 4    a     1      5     11,
 'b':   key1  key2  data1  data2
 2    b     1      3      9
 3    b     2      4     10
 5    b     1      6     12}

In [20]:
# Pull out the sub-frame stored under group key "b".
pieces["b"]

Unnamed: 0,key1,key2,data1,data2
2,b,1,3,9
3,b,2,4,10
5,b,1,6,12


### 在有多个分组键的情况下，元组中的第一个元素将是一个键值的元组

In [21]:
# With two grouping columns the group label is a (key1, key2) tuple,
# unpacked here into k1 and k2 before being printed with its sub-frame.
for (k1, k2), group in df.groupby(["key1", "key2"]):
    print((k1, k2), group, sep="\n")

('a', 1)
  key1  key2  data1  data2
0    a     1      1      7
4    a     1      5     11
('a', 2)
  key1  key2  data1  data2
1    a     2      2      8
('b', 1)
  key1  key2  data1  data2
2    b     1      3      9
5    b     1      6     12
('b', 2)
  key1  key2  data1  data2
3    b     2      4     10


### 默认情况下，groupby 在 axis="index" 上分组，但你可以在其他轴上分组。例如，我们可以在这里根据示例 df 的列是以“key”还是“data”开头对它们进行分组

In [22]:
# Map every column label to the bucket it belongs to ("key" or "data") and
# group along the columns rather than the rows.
# NOTE(review): the `axis` argument of DataFrame.groupby is reported as
# deprecated in recent pandas releases — confirm against the pandas version
# this notebook targets before relying on this form.
grouped = df.groupby(
    {
        "key1": "key",
        "key2": "key",
        "data1": "data",
        "data2": "data",
    },
    axis="columns",
)

In [23]:
# Each step yields a bucket label and the sub-frame made of that bucket's columns.
for group_key, group_values in grouped:
    print(group_key, group_values, sep="\n")

data
   data1  data2
0      1      7
1      2      8
2      3      9
3      4     10
4      5     11
5      6     12
key
  key1  key2
0    a     1
1    a     2
2    b     1
3    b     2
4    a     1
5    b     1


In [24]:
# One row per original row, one column per column bucket ("data", "key").
grouped.count()

Unnamed: 0,data,key
0,2,2
1,2,2
2,2,2
3,2,2
4,2,2
5,2,2
