In [1]:
import numpy as np
import pandas as pd

# 1.使用指定分组值填充缺失值：

### fillna

In [2]:
# Build the frame as float64 from the start: NaN cannot be stored in an int64
# column, and newer pandas versions warn about (and are slated to reject)
# assignments that would have to silently upcast the column.
df = pd.DataFrame(
    {"data1": [1, 2, 3, 4, 5, 6], "data2": [7, 8, 9, 10, 11, 12]},
    dtype="float64",
)
# Blank out alternating rows: even positions in data1, odd positions in data2.
df.iloc[[0, 2, 4], 0] = np.nan
df.iloc[[1, 3, 5], 1] = np.nan
df

Unnamed: 0,data1,data2
0,,7.0
1,2.0,
2,,9.0
3,4.0,
4,,11.0
5,6.0,


In [3]:
# Pull out the data1 column as a Series to demonstrate fillna on a single column.
s = df["data1"]
s

0    NaN
1    2.0
2    NaN
3    4.0
4    NaN
5    6.0
Name: data1, dtype: float64

In [4]:
# Fill the gaps with the mean of the observed values (Series.mean skips NaN by default).
s.fillna(s.mean())

0    4.0
1    2.0
2    4.0
3    4.0
4    4.0
5    6.0
Name: data1, dtype: float64

In [5]:
# Same idea on the whole frame: df.mean() yields one mean per column, and each
# column's NaNs are filled with that column's own mean.
df.fillna(df.mean())

Unnamed: 0,data1,data2
0,4.0,7.0
1,2.0,9.0
2,4.0,9.0
3,4.0,9.0
4,4.0,11.0
5,6.0,9.0


### 填充值因组而异

In [6]:
# Row labels, the group each row belongs to, and the values to be grouped.
index_keys = list("abcdefgh")
group_keys = ["E"] * 4 + ["W"] * 4
data = list(range(1, 9))

In [7]:
# Store the values as float64 up front so the NaN assignment below does not
# rely on pandas silently upcasting an int64 column (deprecated in newer pandas).
df = pd.DataFrame(
    {"group_keys": group_keys, "data": np.asarray(data, dtype="float64")},
    index=index_keys,
)
# Knock out two values per group (positions 0 and 2 in "E", 5 and 7 in "W").
df.iloc[[0, 2, 5, 7], 1] = np.nan
df

Unnamed: 0,group_keys,data
a,E,
b,E,2.0
c,E,
d,E,4.0
e,W,5.0
f,W,
g,W,7.0
h,W,


In [8]:
# Mean of the data column within each group (NaN entries are skipped).
df.groupby("group_keys").mean()

Unnamed: 0_level_0,data
group_keys,Unnamed: 1_level_1
E,3.0
W,6.0


In [9]:
# Fill the missing values of one group with that group's own mean.
def fill_mean(group):
    """Return ``group`` with its NaN entries replaced by ``group.mean()``."""
    group_average = group.mean()
    return group.fillna(value=group_average)

In [10]:
# Fill NA values with each group's own mean.
# group_keys=False keeps the original row index (a..h) on every pandas
# version; since pandas 2.0 the default would otherwise prepend the group
# label as an extra index level.
df.groupby("group_keys", group_keys=False)[["data"]].apply(fill_mean)

Unnamed: 0,data
a,3.0
b,2.0
c,3.0
d,4.0
e,5.0
f,6.0
g,7.0
h,6.0


### 为每个分组预定义了填充值

In [11]:
# Replacement value to use for missing data in each group.
fill_values = dict(E=1, W=-1)

In [12]:
# Look up the predefined fill value through the group's ``name`` attribute.
def fill_func(group):
    """Fill missing values in ``group`` with ``fill_values[group.name]``."""
    replacement = fill_values[group.name]
    return group.fillna(replacement)

In [13]:
# Fill each group's NaNs with its predefined value.
# group_keys=False keeps the original row index (a..h) on every pandas
# version; since pandas 2.0 the default would otherwise prepend the group
# label as an extra index level.
df.groupby("group_keys", group_keys=False)[["data"]].apply(fill_func)

Unnamed: 0,data
a,1.0
b,2.0
c,1.0
d,4.0
e,5.0
f,-1.0
g,7.0
h,-1.0


# 2.随机采样和排列：

In [14]:
# Suffix letters and the numbers 1..5 that are combined into row labels.
keys = list("abcd")
nums = [*range(1, 6)]

In [15]:
# Build the row labels "1a".."5a", "1b".."5b", ...: every number for one key
# before moving on to the next key.
index_key = [str(n) + key for key in keys for n in nums]

In [16]:
# Draw from a seeded generator so the frame is identical on every
# Restart & Run All; the seed value itself is arbitrary.
rng = np.random.default_rng(42)
data = pd.DataFrame(
    {"data1": np.arange(20), "data2": rng.standard_normal(20)},
    index=index_key,
)
data

Unnamed: 0,data1,data2
1a,0,1.259528
2a,1,-1.373935
3a,2,-0.286759
4a,3,-2.595734
5a,4,-1.029578
1b,5,0.182814
2b,6,1.114865
3b,7,0.786265
4b,8,-0.222014
5b,9,0.949835


### 从data中随机选择5行数据

In [17]:
def sample_func(data, n=5, random_state=None):
    """Return ``n`` randomly selected rows of ``data``.

    Parameters
    ----------
    data : object with a pandas-style ``sample`` method (DataFrame or Series)
        The rows to draw from.
    n : int, default 5
        Number of rows to draw.
    random_state : optional
        Forwarded to ``data.sample``; pass a fixed value to get the same
        rows on every call. ``None`` (the default) keeps the draw random.

    Returns
    -------
    The sampled subset, as returned by ``data.sample``.
    """
    return data.sample(n=n, random_state=random_state)

In [18]:
# Draw 5 random rows (the default n); the selection changes on every run.
sample_func(data)

Unnamed: 0,data1,data2
3d,17,-0.780712
1d,15,0.740196
4c,13,-0.621632
2a,1,-1.373935
4a,3,-2.595734


### 根据索引最后的字母分组，每组随机选择3行数据

In [19]:
# Grouping key: the final character of an index label.
def get_char(data):
    """Return the last element of ``data`` (e.g. the trailing letter of "4a")."""
    last_position = -1
    return data[last_position]

In [20]:
# After grouping by the label's last letter, use apply to draw 3 rows from every group.
data.groupby(get_char).apply(sample_func , n=3)

Unnamed: 0,Unnamed: 1,data1,data2
a,4a,3,-2.595734
a,3a,2,-0.286759
a,1a,0,1.259528
b,3b,7,0.786265
b,2b,6,1.114865
b,1b,5,0.182814
c,1c,10,0.460335
c,3c,12,0.306945
c,5c,14,0.318553
d,1d,15,0.740196


### 可以通过 group_keys=False 来删除外层索引

In [21]:
# group_keys=False leaves out the outer index level that holds the group letter.
data.groupby(get_char , group_keys=False).apply(sample_func , n=3)

Unnamed: 0,data1,data2
1a,0,1.259528
4a,3,-2.595734
5a,4,-1.029578
1b,5,0.182814
2b,6,1.114865
3b,7,0.786265
5c,14,0.318553
3c,12,0.306945
1c,10,0.460335
4d,18,-2.354548


# 3.分组加权：

In [22]:
# Two categories of four rows each, with a value and a weight per row.
df = pd.DataFrame(
    {
        "category": ["a"] * 4 + ["b"] * 4,
        "data": np.arange(8),
        "weights": np.arange(8, 16),
    }
)

df

Unnamed: 0,category,data,weights
0,a,0,8
1,a,1,9
2,a,2,10
3,a,3,11
4,b,4,12
5,b,5,13
6,b,6,14
7,b,7,15


### 按category的加权平均值

In [23]:
# Weighted mean of one group, computed with np.average.
def get_wavg(group):
    """Return the average of ``group["data"]`` weighted by ``group["weights"]``."""
    values = group["data"]
    weights = group["weights"]
    return np.average(values, weights=weights)

In [24]:
# Weighted average per category; for "a": (0*8 + 1*9 + 2*10 + 3*11) / (8 + 9 + 10 + 11).
# Selecting the value columns explicitly keeps the grouping column out of the
# frames handed to get_wavg (passing it along is deprecated in pandas 2.2).
df.groupby("category")[["data", "weights"]].apply(get_wavg)

category
a    1.631579
b    5.592593
dtype: float64

# 4.相关性：

In [25]:
# Seeded generator: data2/data3 are reproducible across kernel restarts
# (the seed value itself is arbitrary).
rng = np.random.default_rng(42)
df = pd.DataFrame({"category": ["a", "a", "a", "a", "b", "b", "b", "b"],
                   "data1": np.arange(0, 8),
                   "data2": rng.standard_normal(8),
                   "data3": rng.standard_normal(8)})

df

Unnamed: 0,category,data1,data2,data3
0,a,0,0.164135,1.455387
1,a,1,0.627564,0.095002
2,a,2,-0.181895,-0.342263
3,a,3,0.419431,-2.174636
4,b,4,0.980285,1.162548
5,b,5,2.40067,-0.229024
6,b,6,-2.321926,-0.72879
7,b,7,-1.555422,1.436018


In [26]:
# Per-group correlation of every numeric column with a reference column.
def corr_func(group, target="data3"):
    """Correlate each numeric column of ``group`` with ``group[target]``.

    Parameters
    ----------
    group : DataFrame
        One group's rows; may still contain non-numeric columns such as the
        grouping key.
    target : str, default "data3"
        Name of the column every other column is correlated with.

    Returns
    -------
    Series indexed by column name holding the pairwise correlations.
    """
    # Drop non-numeric columns explicitly: since pandas 2.0 corrwith no longer
    # discards them silently and fails when asked to correlate strings.
    numeric = group.select_dtypes(include=["number", "bool"])
    return numeric.corrwith(group[target])

In [27]:
# Correlate only the numeric columns: leaving the string "category" column in
# each group makes corrwith fail on pandas >= 2.0, where non-numeric columns
# are no longer dropped silently.
df.groupby("category")[["data1", "data2", "data3"]].apply(corr_func)

Unnamed: 0_level_0,data1,data2,data3
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,-0.975637,-0.192981,1.0
b,0.039324,0.036332,1.0


# 5.逐组线性回归：

In [28]:
# Seeded generator: the regressor X is reproducible across kernel restarts
# (the seed value itself is arbitrary).
rng = np.random.default_rng(42)
df = pd.DataFrame({"category": ["a", "a", "a", "a", "b", "b", "b", "b"],
                   "Y": np.arange(0, 8),
                   "X": rng.standard_normal(8)})

df

Unnamed: 0,category,Y,X
0,a,0,-0.728376
1,a,1,0.121379
2,a,2,0.36694
3,a,3,1.941715
4,b,4,0.761987
5,b,5,0.598286
6,b,6,-1.618109
7,b,7,-0.5518


In [29]:
import statsmodels.api as sm
def regress(data, yvar=None, xvars=None):
    """Fit an ordinary-least-squares model of ``yvar`` on ``xvars`` plus a constant.

    Parameters
    ----------
    data : DataFrame
        Rows to fit; must contain the ``yvar`` and ``xvars`` columns.
    yvar : str
        Name of the dependent-variable column. Must be supplied; the ``None``
        default exists only so the arguments can be passed by keyword.
    xvars : str or list of str
        Name(s) of the explanatory columns. Must be supplied.

    Returns
    -------
    The fitted ``params`` of ``sm.OLS``: one coefficient per explanatory
    column plus one labelled ``"intercept"``.
    """
    # Accept a single column name as well as a list so the selection below
    # always yields a DataFrame.
    if isinstance(xvars, str):
        xvars = [xvars]
    Y = data[yvar]
    # assign() returns a new frame, so the constant column is never written
    # into a slice of the caller's data (avoids SettingWithCopyWarning).
    X = data[xvars].assign(intercept=1.0)
    result = sm.OLS(Y, X).fit()
    return result.params

In [30]:
# Per-category linear regression of Y on X.
# Selecting the model columns explicitly keeps the grouping column out of the
# frames handed to regress (passing it along is deprecated in pandas 2.2).
df.groupby("category")[["Y", "X"]].apply(regress, yvar="Y", xvars=["X"])

Unnamed: 0_level_0,X,intercept
category,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.107792,1.028729
b,-0.832703,5.331453
