In [1]:
import numpy as np
import pandas as pd

# 1.数据聚合：

In [2]:
# Fixed seed so the random columns are identical on every run.
rng = np.random.default_rng(12345)

# Demo frame: key1 contains missing labels (None), key2 is a nullable Int64
# column with one missing entry, and data1 is drawn from rng before data2
# (both rounded to two decimals).
df = pd.DataFrame(
    {
        "key1": ["a", "a", None, "b", "b", "a", None],
        "key2": pd.Series([1, 2, 1, 2, 1, None, 1], dtype="Int64"),
        "data1": rng.standard_normal(7).round(2),
        "data2": rng.standard_normal(7).round(2),
    }
)

df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,-1.42,0.65
1,a,2.0,1.26,0.36
2,,1.0,-0.87,-1.95
3,b,2.0,-0.26,2.35
4,b,1.0,-0.08,0.97
5,a,,-0.74,-0.76
6,,1.0,-1.37,0.9


### nsmallest：取每个分组中最小的 n 个值

In [3]:
# Group the rows by the labels in key1; reused by the following cell.
grouped = df.groupby(by="key1")

In [4]:
# The two smallest data1 values within each key1 group.
grouped["data1"].nsmallest(n=2)

key1   
a     0   -1.42
      5   -0.74
b     3   -0.26
      4   -0.08
Name: data1, dtype: float64

### 使用自定义的聚合函数

In [5]:
def peak_to_peak(arr):
    """Custom aggregation: return the maximum of ``arr`` minus its minimum.

    ``arr`` must expose ``max()`` and ``min()`` methods.
    """
    highest, lowest = arr.max(), arr.min()
    return highest - lowest

In [6]:
# Apply the custom peak_to_peak aggregation to every non-key column per key1 group.
df.groupby(by="key1").aggregate(peak_to_peak)

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2.68,1.41
b,1,0.18,1.38


### 某些方法（如 describe）也是有效的，尽管严格来说它们不是聚合函数

In [7]:
# describe() is not a pure aggregation, but it still works on a grouped frame.
df.groupby(by="key1").describe()

Unnamed: 0_level_0,key2,key2,key2,key2,key2,key2,key2,key2,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
a,2.0,1.5,0.707107,1.0,1.25,1.5,1.75,2.0,3.0,-0.3,...,0.26,1.26,3.0,0.083333,0.744603,-0.76,-0.2,0.36,0.505,0.65
b,2.0,1.5,0.707107,1.0,1.25,1.5,1.75,2.0,2.0,-0.17,...,-0.125,-0.08,2.0,1.66,0.975807,0.97,1.315,1.66,2.005,2.35


# 2.逐列或多函数应用：

In [8]:
# Second demo frame: two string key columns and two small integer data columns.
df = pd.DataFrame(
    data={
        "key1": ["a", "a", "b", "b", "a", "b"],
        "key2": pd.Series(["x", "y", "y", "x", "x", "y"]),
        "data1": list(range(1, 7)),
        "data2": list(range(7, 13)),
    }
)

df

Unnamed: 0,key1,key2,data1,data2
0,a,x,1,7
1,a,y,2,8
2,b,y,3,9
3,b,x,4,10
4,a,x,5,11
5,b,y,6,12


### 对于 mean、std 等内置的描述性统计方法，可以将函数名称作为字符串传递

In [9]:
# Built-in aggregations can be requested by name; a single name returns a Series.
df.groupby(by=["key1", "key2"])["data1"].aggregate("mean")

key1  key2
a     x       3.0
      y       2.0
b     x       4.0
      y       4.5
Name: data1, dtype: float64

### 如果改为传递由函数或函数名称组成的列表，则会返回一个 DataFrame，其中列名取自函数名

In [10]:
# A list of aggregations yields one result column per entry, named after the function.
df.groupby(by=["key1", "key2"])["data1"].aggregate(
    ["mean", "std", peak_to_peak]
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,x,3.0,2.828427,4
a,y,2.0,,0
b,x,4.0,,0
b,y,4.5,2.12132,3


### 修改列名

In [11]:
# Rename the result columns by passing (column_name, function) tuples.
# "std" is given as a string instead of np.std: pandas 2.1+ emits a
# FutureWarning for NumPy callables in agg and will eventually call them
# directly, which for np.std means the population formula (ddof=0) rather than
# the sample standard deviation (ddof=1) shown in the output below.
df.groupby(["key1", "key2"])["data1"].agg(
    [("average", "mean"), ("stdev", "std"), ("Custom_Func", peak_to_peak)]
)

Unnamed: 0_level_0,Unnamed: 1_level_0,average,stdev,Custom_Func
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,x,3.0,2.828427,4
a,y,2.0,,0
b,x,4.0,,0
b,y,4.5,2.12132,3


### 可以指定一个函数列表以应用于所有列

In [12]:
# The same list of aggregations is applied to every non-key column,
# producing hierarchical (column, function) result columns.
functions = list(("count", "mean", "max"))
df.groupby(by=["key1", "key2"]).aggregate(functions)

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
a,x,2,3.0,5,2,9.0,11
a,y,1,2.0,2,1,8.0,8
b,x,1,4.0,4,1,10.0,10
b,y,2,4.5,6,2,10.5,12


In [13]:
# A list of (custom_name, function) tuples can be passed as well.
ftuples = [
    ("Average", "mean"),
    ("Custom_Func", peak_to_peak),
]
df.groupby(by=["key1", "key2"]).aggregate(ftuples)

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data1,data2,data2
Unnamed: 0_level_1,Unnamed: 1_level_1,Average,Custom_Func,Average,Custom_Func
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,x,3.0,4,9.0,4
a,y,2.0,0,8.0,0
b,x,4.0,0,10.0,0
b,y,4.5,3,10.5,3


### 对一列或多列应用不同的函数

In [14]:
# Map each column to its own aggregation.
# "max" is given as a string instead of np.max: pandas 2.1+ emits a
# FutureWarning when NumPy callables are passed to agg, and recommends the
# string alias to keep the current behavior.
df.groupby(["key1", "key2"]).agg({"data1": "max", "data2": "sum"})

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,x,5,18
a,y,2,8
b,x,4,10
b,y,6,21


In [15]:
# Per-column lists of aggregations: plain names for data1, and
# (custom_name, function) tuples for data2.
df.groupby(by=["key1", "key2"]).aggregate(
    {
        "data1": ["min", "max", "mean", "std"],
        "data2": [("Average", "mean"), ("Custom_Func", peak_to_peak)],
    }
)

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data1,data1,data1,data2,data2
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,Average,Custom_Func
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
a,x,1,5,3.0,2.828427,9.0,4
a,y,2,2,2.0,,8.0,0
b,x,4,4,4.0,,10.0,0
b,y,3,6,4.5,2.12132,10.5,3


# 3.返回不含行索引的聚合数据：

In [16]:
# Default behavior: the group keys become a hierarchical row index.
df.groupby(by=["key1", "key2"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,x,3.0,9.0
a,y,2.0,8.0
b,x,4.0,10.0
b,y,4.5,10.5


In [17]:
# as_index=False keeps the group keys as ordinary columns instead of the index.
df.groupby(by=["key1", "key2"], as_index=False).mean()

Unnamed: 0,key1,key2,data1,data2
0,a,x,3.0,9.0
1,a,y,2.0,8.0
2,b,x,4.0,10.0
3,b,y,4.5,10.5
