In [2]:
import pandas as pd

# None 、 NaN 、pd.NA 都是缺失值

# 10.2.1 缺失值填充

In [3]:
df = pd.DataFrame({'A':['a1','a1','a2','2'],'B':['b1','b2', None, 'b2'],'C':[1,2,3,4],'D':[5,6,None,8],'E':[5,None,7,8]})
df

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,
2,a2,,3,,7.0
3,2,b2,4,8.0,8.0


In [4]:
# 将缺失值填充0
df1 = df
df1.fillna(0)

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,0.0
2,a2,0,3,0.0,7.0
3,2,b2,4,8.0,8.0


In [5]:
# 将B列缺失值替换成kk1，并立即生效。
df2 = df
df2.B.fillna('kk1', inplace=True)

In [6]:
df2

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,
2,a2,kk1,3,,7.0
3,2,b2,4,8.0,8.0


In [7]:
# 将不同列的缺失值替换为不同的值
df2.fillna({'D':'100','E':'101'})

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,101.0
2,a2,kk1,3,100.0,7.0
3,2,b2,4,8.0,8.0


In [8]:
# 替换取后一个有效值填充
df.fillna(method='bfill')

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,7.0
2,a2,kk1,3,8.0,7.0
3,2,b2,4,8.0,8.0


In [9]:
# 替换取前一个有效值填充
df.fillna(method='ffill')

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,5.0
2,a2,kk1,3,6.0,7.0
3,2,b2,4,8.0,8.0


In [10]:
df

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,
2,a2,kk1,3,,7.0
3,2,b2,4,8.0,8.0


In [11]:
# 将缺失值替换平均值
df.fillna(df.mean(numeric_only=None))

  df.fillna(df.mean(numeric_only=None))


Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,6.666667
2,a2,kk1,3,6.333333,7.0
3,2,b2,4,8.0,8.0


In [12]:
df

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,
2,a2,kk1,3,,7.0
3,2,b2,4,8.0,8.0


In [13]:
# 对指定列D进行替换？？？
df.fillna(df.mean()['D'])

  df.fillna(df.mean()['D'])


Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,6.333333
2,a2,kk1,3,6.333333,7.0
3,2,b2,4,8.0,8.0


In [14]:
# 替换指定列的空值为111
import numpy as np
df.replace({'D':{np.nan:111}})

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,
2,a2,kk1,3,111.0,7.0
3,2,b2,4,8.0,8.0


# 10.2.2 插值填充

# 10.2.3  缺失值删除

In [15]:
df = pd.DataFrame({'A':['a1','a1','a2',None],'B':['b1','b2', None, None],'C':[1,2,3,None],'D':[5,6,None,None],'E':[5,None,7,None]})
df

Unnamed: 0,A,B,C,D,E
0,a1,b1,1.0,5.0,5.0
1,a1,b2,2.0,6.0,
2,a2,,3.0,,7.0
3,,,,,


In [16]:
# 删除有缺失值的行
df.dropna()

Unnamed: 0,A,B,C,D,E
0,a1,b1,1.0,5.0,5.0


In [17]:
# 删除有缺失值的列
# df.dropna(1)
# df.dropna(axis=1)
df.dropna(axis="columns")

0
1
2
3


In [18]:
# 删除所有值都缺失的行。
df.dropna(how='all')

Unnamed: 0,A,B,C,D,E
0,a1,b1,1.0,5.0,5.0
1,a1,b2,2.0,6.0,
2,a2,,3.0,,7.0


In [19]:
df.dropna(subset=['A','E'])

Unnamed: 0,A,B,C,D,E
0,a1,b1,1.0,5.0,5.0
2,a2,,3.0,,7.0


# 10.2.4 缺失值参与计算

In [20]:
df = pd.DataFrame({'A':['a1','a1','a2','2'],'B':['b1','b2', None, 'b2'],'C':[1,2,3,4],'D':[5,6,None,8],'E':[5,None,7,8]})
df

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,
2,a2,,3,,7.0
3,2,b2,4,8.0,8.0


In [21]:
# 对所有列求和，忽略非数字列中包含有缺失值的
df.sum()

  df.sum()


A    a1a1a22
C         10
D       19.0
E       20.0
dtype: object

In [22]:
# 累加
df.D.cumsum()

0     5.0
1    11.0
2     NaN
3    19.0
Name: D, dtype: float64

In [23]:
# 累加,跳过空值
df.D.cumsum(skipna=False)

0     5.0
1    11.0
2     NaN
3     NaN
Name: D, dtype: float64

In [24]:
# 统计每列数量，缺失值不统计。
df.count()

A    4
B    3
C    4
D    3
E    3
dtype: int64

In [25]:
# 对B进行聚合分组，忽略缺失值
df.groupby('B').sum()

  df.groupby('B').sum()


Unnamed: 0_level_0,C,D,E
B,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
b1,1,5.0,5.0
b2,6,14.0,8.0


In [26]:
df

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,
2,a2,,3,,7.0
3,2,b2,4,8.0,8.0


In [27]:
# 聚合分组吧，包括缺失值
df.groupby('B', dropna=False).sum()

  df.groupby('B', dropna=False).sum()


Unnamed: 0_level_0,C,D,E
B,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
b1,1,5.0,5.0
b2,6,14.0,8.0
,3,0.0,7.0
