In [2]:
import pandas as pd


In [3]:
# 设置展示的行数和列数限制
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

pd.DataFrame({'c1': range(10), 'c2': range(100, 110)})

Unnamed: 0,c1,c2
0,0,100
1,1,101
2,2,102
3,3,103
4,4,104
5,5,105
6,6,106
7,7,107
8,8,108
9,9,109


## 缺失值处理
- 缺失值判断
- 删除缺失值
- 替换缺失值
- 填充缺失值

参考: https://www.cnblogs.com/zhangyafei/p/10513515.html

In [4]:
import pandas as pd
import numpy as np
index = pd.Index(data=["Tom", "Bob", "Mary", "James", "Andy", "Alice"], name="name")
data = {
    "age": [18, 30, np.nan, 40, np.nan, 30],
    "city": ["BeiJing", "ShangHai", "GuangZhou", "ShenZhen", np.nan, " "],
    "sex": [None, "male", "female", "male", np.nan, "unknown"],
    "birth": ["2000-02-10", "1988-10-17", None, "1978-08-08", np.nan, "1988-10-17"]
}
user_info = pd.DataFrame(data=data, index=index)
user_info

Unnamed: 0_level_0,age,city,sex,birth
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Tom,18.0,BeiJing,,2000-02-10
Bob,30.0,ShangHai,male,1988-10-17
Mary,,GuangZhou,female,
James,40.0,ShenZhen,male,1978-08-08
Andy,,,,
Alice,30.0,,unknown,1988-10-17


In [5]:
## 判断缺失值
user_info.isna()  # user_info.isnull()

Unnamed: 0_level_0,age,city,sex,birth
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Tom,False,False,True,False
Bob,False,False,False,False
Mary,True,False,False,True
James,False,False,False,False
Andy,True,True,True,True
Alice,False,False,False,False


dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
- axis 参数用于控制行或列，跟其他不一样的是，axis=0 （默认）表示操作行，axis=1 表示操作列。
- how 参数可选的值为 any（默认） 或者 all。any 表示一行/列有任意元素为空时即丢弃，all 一行/列所有值都为空时才丢弃。
- subset 参数表示删除时只考虑的索引或列名。
- thresh参数的类型为整数，它的作用是，比如 thresh=3，会在一行/列中至少有 3 个非空值时将其保留。

In [6]:
## 删除缺失值
user_info.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

Unnamed: 0_level_0,age,city,sex,birth
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bob,30.0,ShangHai,male,1988-10-17
James,40.0,ShenZhen,male,1978-08-08
Alice,30.0,,unknown,1988-10-17


In [7]:
## 缺失值填充
user_info.age.fillna(0)

name
Tom      18.0
Bob      30.0
Mary      0.0
James    40.0
Andy      0.0
Alice    30.0
Name: age, dtype: float64

In [8]:
user_info.age.fillna(method="ffill")

name
Tom      18.0
Bob      30.0
Mary     30.0
James    40.0
Andy     40.0
Alice    30.0
Name: age, dtype: float64

In [9]:
user_info.age.fillna(method="backfill")

name
Tom      18.0
Bob      30.0
Mary     40.0
James    40.0
Andy     30.0
Alice    30.0
Name: age, dtype: float64

In [10]:
user_info.age.interpolate()

name
Tom      18.0
Bob      30.0
Mary     35.0
James    40.0
Andy     35.0
Alice    30.0
Name: age, dtype: float64

## DataFrame值替换

In [11]:
## 缺失值替换： 将空格替换为np.nan
user_info.city.replace(r'\s+', np.nan, regex=True)

name
Tom        BeiJing
Bob       ShangHai
Mary     GuangZhou
James     ShenZhen
Andy           NaN
Alice          NaN
Name: city, dtype: object

In [13]:
## 缺失值替换： 将np.nan替换为空格
user_info.city.replace(np.nan, '', regex=True)

name
Tom        BeiJing
Bob       ShangHai
Mary     GuangZhou
James     ShenZhen
Andy              
Alice             
Name: city, dtype: object

In [14]:
# 多列替换
user_info.replace({"age": 40, "birth": pd.Timestamp("1978-08-08")}, np.nan)

Unnamed: 0_level_0,age,city,sex,birth
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Tom,18.0,BeiJing,,2000-02-10
Bob,30.0,ShangHai,male,1988-10-17
Mary,,GuangZhou,female,
James,,ShenZhen,male,1978-08-08
Andy,,,,
Alice,30.0,,unknown,1988-10-17


In [16]:
# 多值替换
user_info.age.replace({40: np.nan, 18: 18.8})

name
Tom      18.8
Bob      30.0
Mary      NaN
James     NaN
Andy      NaN
Alice    30.0
Name: age, dtype: float64

缺失值替换
> 除了我们自己手动丢弃、填充已经替换缺失值之外，我们还可以使用其他对象来填充。
　　例如有两个关于用户年龄的 Series，其中一个有缺失值，另一个没有，我们可以将没有的缺失值的 Series 中的元素传给有缺失值的。

In [20]:
user_info2 = pd.Series([18.5, 30.9, 27.2, 40.2, 36.9, 30.0], index=user_info.age.index)
user_info.age.combine_first(user_info2)

name
Tom      18.0
Bob      30.0
Mary     27.2
James    40.0
Andy     36.9
Alice    30.0
Name: age, dtype: float64