In [1]:
import numpy as np
import pandas as pd

# 1.选择一列或列的子集：

In [2]:
# Small demo frame: two grouping keys (key1 is text, key2 is a nullable Int64
# column) plus two integer value columns that the later cells aggregate.
df = pd.DataFrame(
    {
        "key1": list("aabbab"),
        "key2": pd.array([1, 2, 1, 2, 1, 1], dtype="Int64"),
        "data1": range(1, 7),
        "data2": range(7, 13),
    }
)

df

Unnamed: 0,key1,key2,data1,data2
0,a,1,1,7
1,a,2,2,8
2,b,1,3,9
3,b,2,4,10
4,a,1,5,11
5,b,1,6,12


### 如果传递的是列表或数组，则此索引操作返回的对象是分组的 DataFrame

In [3]:
# Selecting with a *list* of column names ([["data2"]]) keeps the result
# two-dimensional: the mean comes back as a DataFrame indexed by (key1, key2).
df.groupby(by=["key1", "key2"])[["data2"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,1,9.0
a,2,8.0
b,1,10.5
b,2,10.0


### 如果仅将单个列名作为标量传递，则返回的对象是分组的 Series

In [4]:
# Selecting a single column name as a scalar ("data2", no inner list) yields a
# SeriesGroupBy; it is kept in a variable so the next cells can inspect and aggregate it.
s_grouped = df.groupby(by=["key1", "key2"])["data2"]

In [5]:
# Echo the grouped object: the notebook shows the SeriesGroupBy repr, not data.
s_grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001C2CE29CBE0>

In [6]:
# Mean of data2 within each (key1, key2) group; the result is a Series named
# "data2" whose index is built from the two grouping keys.
s_grouped.mean()

key1  key2
a     1        9.0
      2        8.0
b     1       10.5
      2       10.0
Name: data2, dtype: float64

# 2.用字典或Series分组：

In [7]:
# Demo frame: five people (rows) x five numeric columns filled with 0..24.
# data2/data3 are cast to float64 *before* NaN is written into them: assigning
# NaN into int64 columns relies on silent upcasting, which pandas >= 2.1
# deprecates (PDEP-6). The resulting dtypes match the previous behaviour
# (data2/data3 float, the remaining columns integer).
df = pd.DataFrame(
    np.arange(25).reshape(5, 5),
    index=["Tom", "Steve", "Herry", "Bill", "Wanda"],
    columns=["data1", "data2", "data3", "data4", "data5"],
).astype({"data2": "float64", "data3": "float64"})
# Blank out data2 and data3 in the third row ("Herry") so the aggregations
# below have missing values to deal with.
df.iloc[2:3, [1, 2]] = np.nan
df

Unnamed: 0,data1,data2,data3,data4,data5
Tom,0,1.0,2.0,3,4
Steve,5,6.0,7.0,8,9
Herry,10,,,13,14
Bill,15,16.0,17.0,18,19
Wanda,20,21.0,22.0,23,24


### 假设已知各列的分组对应关系 `mapping`，并且想按组对各列求和

In [8]:
# Column label -> group label. "data6" matches no column of df (its columns are
# data1..data5), so the grouped results below contain only groups "a" and "b".
mapping = dict(
    data1="a",
    data2="a",
    data3="b",
    data4="b",
    data5="a",
    data6="c",
)

In [9]:
# Group the *columns* of df (not its rows) by the labels in `mapping`:
# columns that map to the same label end up in the same group.
# NOTE(review): the `axis` argument of DataFrame.groupby is deprecated in
# pandas >= 2.1. Migrating means grouping the transpose (df.T.groupby(mapping))
# and transposing the aggregated result back in the cell that aggregates
# `by_column` -- confirm against the pandas version this notebook targets.
by_column = df.groupby(mapping, axis="columns")

In [10]:
# Row-wise sum of the columns inside each group; the result has one column per
# group label ("a", "b") and keeps the original row index.
by_column.sum()

Unnamed: 0,a,b
Tom,5.0,5.0
Steve,20.0,15.0
Herry,24.0,13.0
Bill,50.0,35.0
Wanda,65.0,45.0


### 相同的功能适用于Series，可以将其视为固定大小的映射

In [11]:
# Same column -> group correspondence as `mapping`, held in a Series:
# the dict keys become the index and the group labels become the values.
map_series = pd.Series(data=mapping)

In [12]:
# Echo the Series to show the column-label index and the group-label values.
map_series

data1    a
data2    a
data3    b
data4    b
data5    a
data6    c
dtype: object

In [13]:
# Sum the columns of df within each group label taken from map_series.
# Grouping along the columns via `axis="columns"` is deprecated since
# pandas 2.1, so instead the frame is transposed, grouped by its row labels
# (the former column labels), aggregated, and transposed back.
# Every column of df is numeric, so the round trip through .T yields the same
# float result as before: rows = names, columns = group labels.
df.T.groupby(map_series).sum().T

Unnamed: 0,a,b
Tom,5.0,5.0
Steve,20.0,15.0
Herry,24.0,13.0
Bill,50.0,35.0
Wanda,65.0,45.0


In [14]:
# Count the non-missing values per row within each column group.
# Grouping along the columns via `axis="columns"` is deprecated since
# pandas 2.1, so the frame is transposed, grouped by its row labels
# (the former column labels), counted, and transposed back.
# The row holding NaN in data2/data3 therefore reports smaller counts.
df.T.groupby(map_series).count().T

Unnamed: 0,a,b
Tom,3,2
Steve,3,2
Herry,2,1
Bill,3,2
Wanda,3,2


# 3.使用函数分组：

In [15]:
# Rebuild the demo frame for this section: five people (rows) x five numeric
# columns filled with 0..24.
# data2/data3 are cast to float64 *before* NaN is written into them: assigning
# NaN into int64 columns relies on silent upcasting, which pandas >= 2.1
# deprecates (PDEP-6). The resulting dtypes match the previous behaviour
# (data2/data3 float, the remaining columns integer).
df = pd.DataFrame(
    np.arange(25).reshape(5, 5),
    index=["Tom", "Steve", "Herry", "Bill", "Wanda"],
    columns=["data1", "data2", "data3", "data4", "data5"],
).astype({"data2": "float64", "data3": "float64"})
# Blank out data2 and data3 in the third row ("Herry") so the grouped sums
# below skip missing values.
df.iloc[2:3, [1, 2]] = np.nan
df

Unnamed: 0,data1,data2,data3,data4,data5
Tom,0,1.0,2.0,3,4
Steve,5,6.0,7.0,8,9
Herry,10,,,13,14
Bill,15,16.0,17.0,18,19
Wanda,20,21.0,22.0,23,24


### 按索引中名字的长度分组

In [16]:
# A function passed as the group key is called once per index label; here the
# length of each name (3, 4 or 5 characters) becomes the group key.
df.groupby(by=len).sum()

Unnamed: 0,data1,data2,data3,data4,data5
3,0,1.0,2.0,3,4
4,15,16.0,17.0,18,19
5,35,27.0,29.0,44,47


### 将函数与数组、字典或Series等混合

In [17]:
# One label per row of df, in row order: the first three rows are tagged "one",
# the last two "two".
key_list = ["one"] * 3 + ["two"] * 2

In [18]:
# Functions and plain lists can be mixed as group keys: rows are grouped by
# (length of the index name, label from key_list), giving a two-level index.
df.groupby(by=[len, key_list]).sum()

Unnamed: 0,Unnamed: 1,data1,data2,data3,data4,data5
3,one,0,1.0,2.0,3,4
4,two,15,16.0,17.0,18,19
5,one,15,6.0,7.0,21,23
5,two,20,21.0,22.0,23,24


# 4.根据索引层级分组：

In [19]:
# Two-level column index: the outer level "level" (L1/L2) groups the inner
# "data" labels a..e.
columns = pd.MultiIndex.from_tuples(
    [("L1", "a"), ("L1", "b"), ("L1", "c"), ("L2", "d"), ("L2", "e")],
    names=["level", "data"],
)

# 5 x 5 frame of the integers 0..24 carrying the hierarchical columns above.
df = pd.DataFrame(data=np.arange(25).reshape(5, 5), columns=columns)

df

level,L1,L1,L1,L2,L2
data,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


### 根据层级分组，将层级数值或层级名称传递给 level 关键字

In [20]:
# Count the values per row within each outer column level ("L1" / "L2").
# Grouping along the columns via `axis="columns"` is deprecated since
# pandas 2.1, so the frame is transposed (the column MultiIndex becomes the
# row index), grouped on the "level" level, counted, and transposed back.
# The result keeps the original rows and has one column per outer level.
df.T.groupby(level="level").count().T

level,L1,L2
0,3,2
1,3,2
2,3,2
3,3,2
4,3,2
