In [1]:
import pandas as pd
import numpy as np

# 检测缺失数据
使用`.isnull()`或`.notnull()`可以检测出所有`NaN`值

In [6]:
# Build a 6-element Series of random floats and blank out the entry at label 4.
a = pd.Series(np.random.randn(6))
# np.nan is the canonical spelling; the np.NAN alias was removed in NumPy 2.0.
a.loc[4] = np.nan
print(a)
# isnull() yields a boolean Series that is True exactly where the value is NaN.
print(a.isnull())

0    1.029854
1    0.760568
2    0.079467
3   -0.822699
4         NaN
5   -0.404240
dtype: float64
0    False
1    False
2    False
3    False
4     True
5    False
dtype: bool


In [8]:
# Build a 4x4 DataFrame of random floats and blank out the cell at row 2, column 2.
b = pd.DataFrame(np.random.randn(4, 4))
# Use a single .loc assignment instead of chained indexing (b[2][2] = ...):
# chained assignment may write to a temporary and leave b untouched.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.
b.loc[2, 2] = np.nan
print(b)
# notnull() is the element-wise inverse of isnull(): False only at the NaN cell.
print(b.notnull())

          0         1         2         3
0 -0.234529  0.312595  0.750550  0.021029
1 -0.037202  0.585013  0.586366 -1.023269
2  0.090427  0.004957       NaN  2.424920
3 -1.591392  0.210765 -0.970668  1.757776
      0     1      2     3
0  True  True   True  True
1  True  True   True  True
2  True  True  False  True
3  True  True   True  True


# 处理缺失值
## 丢弃
### Series丢弃缺失值
使用`Series.dropna()`方法可丢弃缺失值

In [9]:
# Print the original Series, then a copy with its NaN entries removed.
for series_view in (a, a.dropna()):
    print(series_view)

0    1.029854
1    0.760568
2    0.079467
3   -0.822699
4         NaN
5   -0.404240
dtype: float64
0    1.029854
1    0.760568
2    0.079467
3   -0.822699
5   -0.404240
dtype: float64


### DataFrame丢弃缺失值
DataFrame使用`dropna()`方法时，可以传入以下参数
- axis（默认0）：丢弃行（0）还是丢弃列（1）
- how（默认"any"）：有一个NaN就丢弃（"any"，默认）还是全是NaN才丢弃（"all"）

In [11]:
# Print the original frame, then the frame without rows that contain NaN,
# then the frame without columns that contain NaN.
rows_dropped = b.dropna()
cols_dropped = b.dropna(axis=1)
print(b, rows_dropped, cols_dropped, sep="\n")

          0         1         2         3
0 -0.234529  0.312595  0.750550  0.021029
1 -0.037202  0.585013  0.586366 -1.023269
2  0.090427  0.004957       NaN  2.424920
3 -1.591392  0.210765 -0.970668  1.757776
          0         1         2         3
0 -0.234529  0.312595  0.750550  0.021029
1 -0.037202  0.585013  0.586366 -1.023269
3 -1.591392  0.210765 -0.970668  1.757776
          0         1         3
0 -0.234529  0.312595  0.021029
1 -0.037202  0.585013 -1.023269
2  0.090427  0.004957  2.424920
3 -1.591392  0.210765  1.757776


In [12]:
# how="all" drops a row only when every value in it is NaN.
all_nan_rows_dropped = b.dropna(how="all")
print(all_nan_rows_dropped)

          0         1         2         3
0 -0.234529  0.312595  0.750550  0.021029
1 -0.037202  0.585013  0.586366 -1.023269
2  0.090427  0.004957       NaN  2.424920
3 -1.591392  0.210765 -0.970668  1.757776


## 填充
使用`.fillna()`方法可以将NaN值改为传入的数。传入字典可根据索引有选择地填充；传入`inplace=True`可以直接修改原数据（此时方法返回`None`）

In [14]:
# Print the original Series, then fill NaN with one scalar, then fill NaN
# using a mapping from index label to replacement value.
scalar_filled = a.fillna(100)
label_filled = a.fillna({1: 100, 4: 400})
print(a, scalar_filled, label_filled, sep="\n")

0    1.029854
1    0.760568
2    0.079467
3   -0.822699
4         NaN
5   -0.404240
dtype: float64
0      1.029854
1      0.760568
2      0.079467
3     -0.822699
4    100.000000
5     -0.404240
dtype: float64
0      1.029854
1      0.760568
2      0.079467
3     -0.822699
4    400.000000
5     -0.404240
dtype: float64


In [15]:
print(b)
# With inplace=True, fillna modifies b itself and returns None,
# so the middle print shows "None" rather than a frame.
inplace_result = b.fillna(0, inplace=True)
print(inplace_result)
print(b)

          0         1         2         3
0 -0.234529  0.312595  0.750550  0.021029
1 -0.037202  0.585013  0.586366 -1.023269
2  0.090427  0.004957       NaN  2.424920
3 -1.591392  0.210765 -0.970668  1.757776
None
          0         1         2         3
0 -0.234529  0.312595  0.750550  0.021029
1 -0.037202  0.585013  0.586366 -1.023269
2  0.090427  0.004957  0.000000  2.424920
3 -1.591392  0.210765 -0.970668  1.757776
