# 10.1 缺失值的认定

In [2]:
import pandas as pd

# None 、 NaN 、pd.NA 都是缺失值

In [3]:
df = pd.DataFrame({'A':['a1','a1','a2','2'],'B':['b1','b2', None, 'b2'],'C':[1,2,3,4],'D':[5,6,None,8],'E':[5,None,7,8]})
df

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,
2,a2,,3,,7.0
3,2,b2,4,8.0,8.0


In [5]:
# 全局设置正负无穷当作缺失值
pd.options.mode.use_inf_as_na = True

# 10.1.2 缺失值判断

In [7]:
# 判断缺失值，存在True，不存在False
df.isna()

Unnamed: 0,A,B,C,D,E
0,False,False,False,False,False
1,False,False,False,False,True
2,False,True,False,True,False
3,False,False,False,False,False


In [8]:
df.D.isna()

0    False
1    False
2     True
3    False
Name: D, dtype: bool

In [9]:
# 对 isna取反
df.notna()

Unnamed: 0,A,B,C,D,E
0,True,True,True,True,True
1,True,True,True,True,False
2,True,False,True,False,True
3,True,True,True,True,True


# 10.1.3 缺失值统计

In [11]:
# 布尔值统计
pd.Series([True,True,False]).sum()

2

In [12]:
# 统计每列的缺失值数量（为True的数量）
df.isnull().sum()
# df.isna().sum() 同上

A    0
B    1
C    0
D    1
E    1
dtype: int64

In [15]:
# 统计每行缺失值数量
df.isna().sum(axis=1)  # df.isna().sum(1)

0    0
1    1
2    2
3    0
dtype: int64

In [17]:
# 统计所有列总共有多少缺失值
df.isna().sum().sum()

3

# 10.1.4 缺失值筛选

In [18]:
df

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,
2,a2,,3,,7.0
3,2,b2,4,8.0,8.0


In [22]:
# 筛选包含缺失值的行
df.loc[df.isna().any(1)]

  df.loc[df.isna().any(1)]


Unnamed: 0,A,B,C,D,E
1,a1,b2,2,6.0,
2,a2,,3,,7.0


In [20]:
# 筛选包含缺失值的列
df.loc[:, df.isna().any()]

Unnamed: 0,B,D,E
0,b1,5.0,5.0
1,b2,6.0,
2,,,7.0
3,b2,8.0,8.0


In [27]:
# 筛选没有缺失值的行
df.loc[~df.isna().any(1)]

  df.loc[~df.isna().any(1)]


Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
3,2,b2,4,8.0,8.0


In [25]:
# 筛选没有包含缺失值的列
df.loc[:, ~df.isna().any()]

Unnamed: 0,A,C
0,a1,1
1,a1,2
2,a2,3
3,2,4


# 10.1.5 NA标量（空整数、空布尔、空字符串）

In [30]:
s = pd.Series([1,2,None,4],dtype='Int64')
s

0       1
1       2
2    <NA>
3       4
dtype: Int64

In [32]:
s[2]

<NA>

In [33]:
# pd.NA本身是一个缺失值
s[2] is pd.NA

True

# 10.1.6 时间数据中的缺失值

In [34]:
# NaT表示时间缺失值，与NaN是兼容的。
pd.Series([pd.Timestamp('20200101'),None, pd.Timestamp('20200103')])


0   2020-01-01
1          NaT
2   2020-01-03
dtype: datetime64[ns]

# 10.1.7 整型数据中的缺失值

In [37]:
# 由于NaN是浮点型，因此缺少一个整数的列可以转换为整型


In [36]:
df

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,
2,a2,,3,,7.0
3,2,b2,4,8.0,8.0


In [38]:
# nan是float64
type(df.at[2,'D'])

numpy.float64

In [39]:
df.at[2,'D']

nan

# 10.1.8 插入缺失值

In [41]:
# 将第一二行全部改为NaN
df.loc[0] = df.loc[1] = None

In [42]:
df

Unnamed: 0,A,B,C,D,E
0,,,,,
1,,,,,
2,a2,,3.0,,7.0
3,2,b2,4.0,8.0,8.0


In [43]:
# 将A列改为NA
df.A = pd.NA
df

Unnamed: 0,A,B,C,D,E
0,,,,,
1,,,,,
2,,,3.0,,7.0
3,,b2,4.0,8.0,8.0


# 10.2 缺失值的操作

# 10.2.1 缺失值填充

In [47]:
df = pd.DataFrame({'A':['a1','a1','a2','2'],'B':['b1','b2', None, 'b2'],'C':[1,2,3,4],'D':[5,6,None,8],'E':[5,None,7,8]})
df

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,
2,a2,,3,,7.0
3,2,b2,4,8.0,8.0


In [48]:
# 将缺失值填充0
df1 = df
df1.fillna(0)

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,0.0
2,a2,0,3,0.0,7.0
3,2,b2,4,8.0,8.0


In [51]:
# 将B列缺失值替换成kk1，并立即生效。
df2 = df
df2.B.fillna('kk1', inplace=True)

In [52]:
df2

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,
2,a2,kk1,3,,7.0
3,2,b2,4,8.0,8.0


In [55]:
# 将不同列的缺失值替换为不同的值
df2.fillna({'D':'100','E':'101'})

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,101.0
2,a2,kk1,3,100.0,7.0
3,2,b2,4,8.0,8.0


In [57]:
# 替换取后一个有效值填充
df.fillna(method='bfill')

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,7.0
2,a2,kk1,3,8.0,7.0
3,2,b2,4,8.0,8.0


In [58]:
# 替换取前一个有效值填充
df.fillna(method='ffill')

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,5.0
2,a2,kk1,3,6.0,7.0
3,2,b2,4,8.0,8.0


In [59]:
df

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,
2,a2,kk1,3,,7.0
3,2,b2,4,8.0,8.0


In [61]:
# 将缺失值替换平均值
df.fillna(df.mean(numeric_only=None))

  df.fillna(df.mean(numeric_only=None))


Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,6.666667
2,a2,kk1,3,6.333333,7.0
3,2,b2,4,8.0,8.0


In [62]:
df

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,
2,a2,kk1,3,,7.0
3,2,b2,4,8.0,8.0


In [64]:
# 对指定列D进行替换？？？
df.fillna(df.mean()['D'])

  df.fillna(df.mean()['D'])


Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,6.333333
2,a2,kk1,3,6.333333,7.0
3,2,b2,4,8.0,8.0


In [66]:
# 替换指定列的空值为111
import numpy as np
df.replace({'D':{np.nan:111}})

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,
2,a2,kk1,3,111.0,7.0
3,2,b2,4,8.0,8.0


# 10.2.2 插值填充

# 10.2.3  缺失值删除

In [76]:
df = pd.DataFrame({'A':['a1','a1','a2',None],'B':['b1','b2', None, None],'C':[1,2,3,None],'D':[5,6,None,None],'E':[5,None,7,None]})
df

Unnamed: 0,A,B,C,D,E
0,a1,b1,1.0,5.0,5.0
1,a1,b2,2.0,6.0,
2,a2,,3.0,,7.0
3,,,,,


In [77]:
# 删除有缺失值的行
df.dropna()

Unnamed: 0,A,B,C,D,E
0,a1,b1,1.0,5.0,5.0


In [78]:
# 删除有缺失值的列
# df.dropna(1)
# df.dropna(axis=1)
df.dropna(axis="columns")

0
1
2
3


In [83]:
# 删除所有值都缺失的行。
df.dropna(how='all')

Unnamed: 0,A,B,C,D,E
0,a1,b1,1.0,5.0,5.0
1,a1,b2,2.0,6.0,
2,a2,,3.0,,7.0


In [84]:
df.dropna(subset=['A','E'])

Unnamed: 0,A,B,C,D,E
0,a1,b1,1.0,5.0,5.0
2,a2,,3.0,,7.0


# 10.2.4 缺失值参与计算

In [89]:
df = pd.DataFrame({'A':['a1','a1','a2','2'],'B':['b1','b2', None, 'b2'],'C':[1,2,3,4],'D':[5,6,None,8],'E':[5,None,7,8]})
df

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,
2,a2,,3,,7.0
3,2,b2,4,8.0,8.0


In [90]:
# 对所有列求和，忽略非数字列中包含有缺失值的
df.sum()

  df.sum()


A    a1a1a22
C         10
D       19.0
E       20.0
dtype: object

In [91]:
# 累加
df.D.cumsum()

0     5.0
1    11.0
2     NaN
3    19.0
Name: D, dtype: float64

In [92]:
# 累加,跳过空值
df.D.cumsum(skipna=False)

0     5.0
1    11.0
2     NaN
3     NaN
Name: D, dtype: float64

In [93]:
# 统计每列数量，缺失值不统计。
df.count()

A    4
B    3
C    4
D    3
E    3
dtype: int64

In [94]:
# 对B进行聚合分组，忽略缺失值
df.groupby('B').sum()

  df.groupby('B').sum()


Unnamed: 0_level_0,C,D,E
B,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
b1,1,5.0,5.0
b2,6,14.0,8.0


In [95]:
df

Unnamed: 0,A,B,C,D,E
0,a1,b1,1,5.0,5.0
1,a1,b2,2,6.0,
2,a2,,3,,7.0
3,2,b2,4,8.0,8.0


In [96]:
# 聚合分组吧，包括缺失值
df.groupby('B', dropna=False).sum()

  df.groupby('B', dropna=False).sum()


Unnamed: 0_level_0,C,D,E
B,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
b1,1,5.0,5.0
b2,6,14.0,8.0
,3,0.0,7.0
