In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a float64 Series that mixes both missing-value markers (np.nan and None).
data = pd.Series(
    [1, np.nan, 2, None, 3],
    dtype="float64",
)
data

0    1.0
1    NaN
2    2.0
3    NaN
4    3.0
dtype: float64

In [3]:
# isna() returns a boolean Series that is True at every position holding a missing value.
data.isna()

0    False
1     True
2    False
3     True
4    False
dtype: bool

In [4]:
# String values interleaved with both missing-value markers (np.nan and None).
data = pd.Series(
    ["a", np.nan, "b", None, "c"],
)
data

0       a
1     NaN
2       b
3    None
4       c
dtype: object

In [5]:
# For this Series of strings, isna() flags both the np.nan entry and the None entry as missing.
data.isna()

0    False
1     True
2    False
3     True
4    False
dtype: bool

# 1.过滤缺失值：

### 1.1 Series对象：

In [6]:
# Sample Series with two missing entries, used by the filtering examples.
obj = pd.Series(
    [1, np.nan, 3, np.nan, 5],
)
obj

0    1.0
1    NaN
2    3.0
3    NaN
4    5.0
dtype: float64

In [7]:
# dropna() drops the missing entries; the surviving values keep their original index labels.
obj.dropna()

0    1.0
2    3.0
4    5.0
dtype: float64

In [8]:
# Boolean-mask indexing: keep only the entries where notna() is True (same result as dropna()).
obj[obj.notna()]

0    1.0
2    3.0
4    5.0
dtype: float64

### 1.2 DataFrame对象：

In [9]:
# Sample frame: row 0 is complete, rows 1 and 3 are partly missing, row 2 is entirely missing.
df = pd.DataFrame(
    [
        [1, 5, 9],
        [2, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 8, 2],
    ],
    columns=["a", "b", "c"],
)
df

Unnamed: 0,a,b,c
0,1.0,5.0,9.0
1,2.0,,
2,,,
3,,8.0,2.0


#### 删除包含缺失值的行

In [10]:
# With default arguments, dropna() removes every row that contains at least one missing value.
df.dropna()

Unnamed: 0,a,b,c
0,1.0,5.0,9.0


#### 删除所有值均为缺失值的行

In [11]:
# how="all" drops a row only when every value in that row is missing.
df.dropna(how="all")

Unnamed: 0,a,b,c
0,1.0,5.0,9.0
1,2.0,,
3,,8.0,2.0


#### 仅保留至少包含指定数量（thresh）非缺失值的行

In [12]:
# thresh=2 keeps only the rows that have at least two non-missing values
# (it counts valid values to keep, not missing values to drop).
df.dropna(thresh=2)

Unnamed: 0,a,b,c
0,1.0,5.0,9.0
3,,8.0,2.0


#### 按列删除缺失值（axis=1，仅保留至少包含 thresh 个非缺失值的列）

In [13]:
# Sample frame for column-wise dropping: column "d" is entirely missing,
# "a" and "b" have two valid values each, "c" has three.
df = pd.DataFrame(
    [
        [1, 5, 9, np.nan],
        [2, np.nan, 7, np.nan],
        [np.nan, np.nan, np.nan, np.nan],
        [np.nan, 8, 2, np.nan],
    ],
    columns=["a", "b", "c", "d"],
)
df

Unnamed: 0,a,b,c,d
0,1.0,5.0,9.0,
1,2.0,,7.0,
2,,,,
3,,8.0,2.0,


In [14]:
# Drop columns (axis=1) that have fewer than 3 non-missing values.
# `how` and `thresh` are mutually exclusive: pandas >= 1.5 raises TypeError when
# both are passed (older versions silently let `thresh` win), so only `thresh`
# is given here. The result is unchanged.
df.dropna(axis=1, thresh=3)

Unnamed: 0,c
0,9.0
1,7.0
2,
3,2.0


# 2.填充缺失值：

In [15]:
# 6x3 float frame holding 0..17 row by row; then blank out the first three rows
# of column "b" and the last three rows of column "c".
df = pd.DataFrame(
    np.arange(18).reshape(6, 3),
    columns=["a", "b", "c"],
    dtype="float64",
)
df.loc[df.index[:3], "b"] = np.nan
df.loc[df.index[-3:], "c"] = np.nan
df

Unnamed: 0,a,b,c
0,0.0,,2.0
1,3.0,,5.0
2,6.0,,8.0
3,9.0,10.0,
4,12.0,13.0,
5,15.0,16.0,


#### 调用 fillna 可以用一个常量值替换缺失值

In [16]:
# A scalar argument replaces every missing value in the frame with that constant.
df.fillna(99)

Unnamed: 0,a,b,c
0,0.0,99.0,2.0
1,3.0,99.0,5.0
2,6.0,99.0,8.0
3,9.0,10.0,99.0
4,12.0,13.0,99.0
5,15.0,16.0,99.0


#### 使用字典调用fillna，可以为每一列使用不同的填充值

In [17]:
# A dict maps column name -> fill value, so each column can get its own replacement.
df.fillna({"b":99 , "c":999})

Unnamed: 0,a,b,c
0,0.0,99.0,2.0
1,3.0,99.0,5.0
2,6.0,99.0,8.0
3,9.0,10.0,999.0
4,12.0,13.0,999.0
5,15.0,16.0,999.0


#### 用于重建索引的插值方法（ffill、bfill）同样可以用来填充缺失值

In [18]:
# Backward fill: each missing value takes the next valid value below it in the
# same column; trailing gaps with nothing below them stay NaN.
# DataFrame.bfill() replaces fillna(method="bfill"), whose `method` argument is
# deprecated since pandas 2.1.
df.bfill()

Unnamed: 0,a,b,c
0,0.0,10.0,2.0
1,3.0,10.0,5.0
2,6.0,10.0,8.0
3,9.0,10.0,
4,12.0,13.0,
5,15.0,16.0,


In [19]:
# Forward fill: each missing value takes the last valid value above it in the
# same column; limit=2 fills at most two consecutive gaps, and leading gaps
# with nothing above them stay NaN.
# DataFrame.ffill() replaces fillna(method="ffill"), whose `method` argument is
# deprecated since pandas 2.1.
df.ffill(limit=2)

Unnamed: 0,a,b,c
0,0.0,,2.0
1,3.0,,5.0
2,6.0,,8.0
3,9.0,10.0,8.0
4,12.0,13.0,8.0
5,15.0,16.0,


#### 日常数据分析中，一般会使用数据的平均值或中位数来填充缺失值

In [20]:
# df.mean() yields one mean per column (missing values are skipped);
# fillna() then fills each column's gaps with that column's own mean.
df.fillna(df.mean())

Unnamed: 0,a,b,c
0,0.0,13.0,2.0
1,3.0,13.0,5.0
2,6.0,13.0,8.0
3,9.0,10.0,5.0
4,12.0,13.0,5.0
5,15.0,16.0,5.0
