DataFrame的NaN值处理

In [3]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

### isnull()/notnull()

In [4]:
# Seed a dedicated generator so that "Restart Kernel -> Run All" rebuilds the
# exact same sample frame every time.
SEED = 42
rng = np.random.default_rng(SEED)

# Five random people. As with np.random.randint, the upper bound passed to
# rng.integers is exclusive.
df = DataFrame({
    'sex': rng.choice(['男', '女'], size=5),
    'height': rng.integers(155, 190, size=5),
    'weight': rng.integers(50, 120, size=5),
    'age': rng.integers(18, 30, size=5)
})
df

Unnamed: 0,sex,height,weight,age
0,女,186,65,19
1,女,189,94,24
2,女,180,74,22
3,男,183,52,19
4,女,182,83,18


In [7]:
# NaN is a float, so an integer column cannot hold it. Cast the two columns to
# float64 explicitly instead of relying on pandas' implicit int -> float upcast
# on assignment, which newer pandas releases deprecate (PDEP-6).
df = df.astype({'height': 'float64', 'weight': 'float64'})

# Set the height in the second row (label 1) to None; pandas stores it as NaN.
df.loc[1, 'height'] = None
# Set the weight in the third row (label 2) to NaN.
df.loc[2, 'weight'] = np.nan
df

Unnamed: 0,sex,height,weight,age
0,女,186.0,65.0,19
1,女,,94.0,24
2,女,180.0,,22
3,男,183.0,52.0,19
4,女,182.0,83.0,18


### 使用dropna()将带有NaN值的行或列删除

In [8]:
# The defaults, spelled out: work row by row and drop every row that holds at
# least one NaN.
df.dropna(axis=0, how='any')

Unnamed: 0,sex,height,weight,age
0,女,186.0,65.0,19
3,男,183.0,52.0,19
4,女,182.0,83.0,18


In [None]:
# Column-wise: a column is dropped only when every value in it is NaN.
df.dropna(axis='columns', how='all')

In [10]:
# Take an explicit deep copy (the default for copy()) to work on separately.
df_2 = df.copy(deep=True)
df_2

Unnamed: 0,sex,height,weight,age
0,女,186.0,65.0,19
1,女,,94.0,24
2,女,180.0,,22
3,男,183.0,52.0,19
4,女,182.0,83.0,18


In [13]:
# With inplace=True the frame itself is modified and the call returns None,
# so the result has to be inspected through df_2 afterwards.
df_2.dropna(axis=0, how='any', inplace=True)
df_2

Unnamed: 0,sex,height,weight,age
0,女,186.0,65.0,19
3,男,183.0,52.0,19
4,女,182.0,83.0,18


### fillna()填充

In [15]:
# Substitute the constant 100 for every NaN.
df.fillna(value=100)

Unnamed: 0,sex,height,weight,age
0,女,186.0,65.0,19
1,女,100.0,94.0,24
2,女,180.0,100.0,22
3,男,183.0,52.0,19
4,女,182.0,83.0,18


In [17]:
# Fill each NaN from a neighbouring valid value:
#   ffill (alias: pad) -> forward fill: reuse the previous valid value
#   bfill              -> backward fill: reuse the next valid value
# `axis` picks the direction. axis=0 walks down the rows, so every NaN takes
# the value from the row above in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'); the `method` argument of
# fillna is deprecated in pandas 2.1+.
df.ffill(axis=0)

Unnamed: 0,sex,height,weight,age
0,女,186.0,65.0,19
1,女,186.0,94.0,24
2,女,180.0,94.0,22
3,男,183.0,52.0,19
4,女,182.0,83.0,18


In [18]:
# Backward fill along axis 0: every NaN takes the value from the row below in
# the same column. DataFrame.bfill() replaces the deprecated
# fillna(method='bfill').
df.bfill(axis=0)

Unnamed: 0,sex,height,weight,age
0,女,186.0,65.0,19
1,女,180.0,94.0,24
2,女,180.0,52.0,22
3,男,183.0,52.0,19
4,女,182.0,83.0,18


In [19]:
# Add another missing height, this time in the row labelled 3.
df.loc[3, 'height'] = np.nan
df

Unnamed: 0,sex,height,weight,age
0,女,186.0,65.0,19
1,女,,94.0,24
2,女,180.0,,22
3,男,,52.0,19
4,女,182.0,83.0,18


In [20]:
# `limit` caps how many NaNs in one consecutive run get filled.
# NOTE: the NaNs set earlier sit in rows 1 and 3 of 'height' and row 2 of
# 'weight', so no column has two in a row and limit=1 gives the same result as
# an unlimited backward fill on this data.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill').
df.bfill(axis=0, limit=1)

Unnamed: 0,sex,height,weight,age
0,女,186.0,65.0,19
1,女,180.0,94.0,24
2,女,180.0,52.0,22
3,男,182.0,52.0,19
4,女,182.0,83.0,18


###  isnull()和notnull()在DataFrame中的使用

In [22]:
# Per-column check: True for every column that contains at least one NaN.
df.isnull().any(axis='index')

sex       False
height     True
weight     True
age       False
dtype: bool

In [23]:
# Show only the columns that contain at least one NaN.
cols_with_nan = df.isnull().any(axis=0)
df.loc[:, cols_with_nan]

Unnamed: 0,height,weight
0,186.0,65.0
1,,94.0
2,180.0,
3,,52.0
4,182.0,83.0

In [24]:
# Per-row check: True for every row that contains at least one NaN.
df.isnull().any(axis='columns')

0    False
1     True
2     True
3     True
4    False
dtype: bool

In [25]:
# Show only the rows that contain at least one NaN.
rows_with_nan = df.isnull().any(axis=1)
df.loc[rows_with_nan]

Unnamed: 0,sex,height,weight,age
1,女,,94.0,24
2,女,180.0,,22
3,男,,52.0,19


In [27]:
# Prepare data for the duplicate-row removal demo: overwrite row 1 with the
# values of row 0 so that the frame contains a duplicated row.
df.loc[1] = df.loc[0]
df

Unnamed: 0,sex,height,weight,age
0,女,186.0,65.0,19
1,女,186.0,65.0,19
2,女,180.0,,22
3,男,,52.0,19
4,女,182.0,83.0,18


In [29]:
# `keep` decides which copy of a duplicated row survives:
#   'first' (the default) keeps the first occurrence
#   'last'  keeps the last occurrence
df.drop_duplicates(subset=None, keep='last')

Unnamed: 0,sex,height,weight,age
1,女,186.0,65.0,19
2,女,180.0,,22
3,男,,52.0,19
4,女,182.0,83.0,18
