# 数据清洗和准备

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load guazi.csv (relative path, resolved against the current working directory).
# NOTE(review): the displayed output contains rows that simply repeat the column
# names (index 4 and 9), so the file appears to have header lines embedded in the
# data — confirm and filter them out before analysis.
data = pd.read_csv(r'guazi.csv')
data

Unnamed: 0,leixing,nianfen,licheng,didian,shoujia,yuanjia
0,凯迪拉克ATS-L 2016款 28T 时尚型,2016年,2.5万公里,长沙,16.77万,34.60万
1,奥迪A6L 2014款 TFSI 标准型,2014年,13.8万公里,长沙,21.96万,44.50万
2,本田 思域 2016款 1.8L 自动舒适版,2016年,4.8万公里,长沙,8.87万,15.20万
3,大众 朗逸 2015款 1.6L 自动舒适版,2016年,10.5万公里,长沙,7.27万,14.90万
4,leixing,nianfen,licheng,didian,shoujia,yuanjia
5,凯迪拉克ATS-L 2016款 28T 时尚型,2016年,2.5万公里,长沙,16.77万,34.60万
6,奥迪A6L 2014款 TFSI 标准型,2014年,13.8万公里,长沙,21.96万,44.50万
7,本田 思域 2016款 1.8L 自动舒适版,2016年,4.8万公里,长沙,8.87万,15.20万
8,大众 朗逸 2015款 1.6L 自动舒适版,2016年,10.5万公里,长沙,7.27万,14.90万
9,leixing,nianfen,licheng,didian,shoujia,yuanjia


## 一、处理缺失数据
- 数据清洗是数据分析关键的一步，直接影响之后的处理工作
- 数据需要修改吗？有什么需要修改的吗？数据应该怎么调整才能适用于接下来的分析和挖掘？
- 数据清洗是一个迭代的过程，实际项目中可能需要不止一次地执行这些清洗操作
- `DataFrame.fillna()` / `Series.fillna()`：填充缺失值
- `DataFrame.dropna()` / `Series.dropna()`：滤除缺失值

In [3]:
# Four-element Series whose third entry (label 2) is missing.
data1 = pd.Series(['a', 'b', np.nan, 'd'])
data1

0      a
1      b
2    NaN
3      d
dtype: object

In [4]:
# Boolean mask: True where the element is missing.
data1.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# Boolean-mask indexing: select only the entries that are missing.
data1[data1.isnull()]

2    NaN
dtype: object

In [6]:
# None is treated as missing too: after the assignment isnull() is True for label 0.
# Note that this mutates data1 in place, so outputs displayed by earlier cells
# no longer reflect its current contents.
data1[0] = None
data1.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### 1.1 滤除缺失数据

In [7]:
# Drop the missing entries; the surviving values keep their original labels (1 and 3).
data1.dropna()

1    b
3    d
dtype: object

In [8]:
# Same result as dropna() here: keep only the entries where notnull() is True.
data1[data1.notnull()]

1    b
3    d
dtype: object

In [9]:
# Four-row float frame used for the dropna demos: one complete row, one row with
# only its first value present, one entirely missing row, and one row missing
# only its first value.
data2 = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.7, 7.0],
    ]
)
data2

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.7,7.0


In [10]:
# With no arguments, every row that contains at least one NaN is dropped;
# only row 0 is complete, so it is the only row left.
data2.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [11]:
# how='all' drops only the rows in which every value is NaN (row 2 here);
# rows that still hold at least one non-NaN value are kept.
data2.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.7,7.0


In [12]:
# With axis=1 the same rule is applied to columns: a column is dropped only if all
# of its values are NaN. No column of data2 is entirely NaN, so nothing is removed.
data2.dropna(how='all', axis=1)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.7,7.0


In [13]:
# 7x3 frame of standard-normal samples. A seeded Generator is used so the values
# are identical on every Restart & Run All; the unseeded np.random.randn produced
# different numbers (and therefore different displayed outputs) on each run.
df = pd.DataFrame(np.random.default_rng(seed=0).standard_normal((7, 3)))
df

Unnamed: 0,0,1,2
0,-1.638823,-0.554539,-1.05143
1,1.239791,-0.30208,3.164534
2,-0.563026,-0.342345,0.301789
3,1.471114,-0.789268,-0.562442
4,0.107645,0.342451,-1.111017
5,0.492868,-2.205819,-0.151976
6,0.330407,0.263228,0.86437


In [14]:
# Create missing data: set the first four rows of the column at position 1 to NaN.
# This modifies df in place.
df.iloc[:4, 1] = np.nan
df

Unnamed: 0,0,1,2
0,-1.638823,,-1.05143
1,1.239791,,3.164534
2,-0.563026,,0.301789
3,1.471114,,-0.562442
4,0.107645,0.342451,-1.111017
5,0.492868,-2.205819,-0.151976
6,0.330407,0.263228,0.86437


In [15]:
# Additionally set the first two rows of the column at position 2 to NaN (in place),
# so rows 0-1 now have two missing values and rows 2-3 have one.
df.iloc[:2, 2] = np.nan
df

Unnamed: 0,0,1,2
0,-1.638823,,
1,1.239791,,
2,-0.563026,,0.301789
3,1.471114,,-0.562442
4,0.107645,0.342451,-1.111017
5,0.492868,-2.205819,-0.151976
6,0.330407,0.263228,0.86437


In [16]:
# Drop every row that contains any NaN; only the complete rows 4-6 remain.
df.dropna()

Unnamed: 0,0,1,2
4,0.107645,0.342451,-1.111017
5,0.492868,-2.205819,-0.151976
6,0.330407,0.263228,0.86437


In [17]:
# thresh=n keeps only the rows (or columns, with axis=1) that have at least n
# non-NaN values. It is a minimum count of valid values, not a count of missing ones.
# With thresh=2, rows 0-1 (one valid value each) are dropped; rows 2-3 (two valid
# values each) are kept.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-0.563026,,0.301789
3,1.471114,,-0.562442
4,0.107645,0.342451,-1.111017
5,0.492868,-2.205819,-0.151976
6,0.330407,0.263228,0.86437


### 1.2 填充数据

In [26]:
# Replace every NaN with the constant 0. The result is a new frame; df itself
# still contains the NaNs.
df.fillna(0)

Unnamed: 0,0,1,2
0,-1.638823,0.0,0.0
1,1.239791,0.0,0.0
2,-0.563026,0.0,0.301789
3,1.471114,0.0,-0.562442
4,0.107645,0.342451,-1.111017
5,0.492868,-2.205819,-0.151976
6,0.330407,0.263228,0.86437


In [28]:
# A dict maps column label -> fill value: NaNs in column 1 become 0.9 and
# NaNs in column 2 become 0.2.
df1 = df.fillna({1:0.9, 2:0.2})
df1

Unnamed: 0,0,1,2
0,-1.638823,0.9,0.2
1,1.239791,0.9,0.2
2,-0.563026,0.9,0.301789
3,1.471114,0.9,-0.562442
4,0.107645,0.342451,-1.111017
5,0.492868,-2.205819,-0.151976
6,0.330407,0.263228,0.86437


In [29]:
# 6x3 frame of standard-normal samples for the forward-fill demos. A seeded
# Generator keeps the values identical on every Restart & Run All; the unseeded
# np.random.randn produced different numbers on each run.
df2 = pd.DataFrame(np.random.default_rng(seed=1).standard_normal((6, 3)))
df2

Unnamed: 0,0,1,2
0,-0.874981,0.501901,0.972177
1,-0.001418,-1.137341,-0.450748
2,1.479713,1.048862,1.767906
3,-0.160005,-1.541502,1.470752
4,-0.349701,2.39663,0.239148
5,-0.138928,0.647725,-1.074251


In [31]:
# Create missing data in place: column position 1 from row 2 onward,
# and column position 2 from row 4 onward.
df2.iloc[2:, 1] = np.nan
df2.iloc[4:, 2] = np.nan
df2

Unnamed: 0,0,1,2
0,-0.874981,0.501901,0.972177
1,-0.001418,-1.137341,-0.450748
2,1.479713,,1.767906
3,-0.160005,,1.470752
4,-0.349701,,
5,-0.138928,,


In [33]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() is used instead of fillna(method='ffill'), whose `method`
# keyword is deprecated since pandas 2.1.
df2.ffill()

Unnamed: 0,0,1,2
0,-0.874981,0.501901,0.972177
1,-0.001418,-1.137341,-0.450748
2,1.479713,-1.137341,1.767906
3,-0.160005,-1.137341,1.470752
4,-0.349701,-1.137341,1.470752
5,-0.138928,-1.137341,1.470752


In [36]:
# Cap the forward fill: limit=2 fills at most two consecutive NaNs after a valid
# value and leaves the rest of the gap as NaN.
# DataFrame.ffill() is used instead of fillna(method='ffill', ...), whose `method`
# keyword is deprecated since pandas 2.1.
df2.ffill(limit=2)

Unnamed: 0,0,1,2
0,-0.874981,0.501901,0.972177
1,-0.001418,-1.137341,-0.450748
2,1.479713,-1.137341,1.767906
3,-0.160005,-1.137341,1.470752
4,-0.349701,,1.470752
5,-0.138928,,1.470752


### 例子

In [38]:
# dropna() removes only rows that contain NaN. The output is identical to the
# loaded frame: the repeated header rows (index 4 and 9) hold ordinary strings,
# not NaN, so they are not removed by this call.
data.dropna()

Unnamed: 0,leixing,nianfen,licheng,didian,shoujia,yuanjia
0,凯迪拉克ATS-L 2016款 28T 时尚型,2016年,2.5万公里,长沙,16.77万,34.60万
1,奥迪A6L 2014款 TFSI 标准型,2014年,13.8万公里,长沙,21.96万,44.50万
2,本田 思域 2016款 1.8L 自动舒适版,2016年,4.8万公里,长沙,8.87万,15.20万
3,大众 朗逸 2015款 1.6L 自动舒适版,2016年,10.5万公里,长沙,7.27万,14.90万
4,leixing,nianfen,licheng,didian,shoujia,yuanjia
5,凯迪拉克ATS-L 2016款 28T 时尚型,2016年,2.5万公里,长沙,16.77万,34.60万
6,奥迪A6L 2014款 TFSI 标准型,2014年,13.8万公里,长沙,21.96万,44.50万
7,本田 思域 2016款 1.8L 自动舒适版,2016年,4.8万公里,长沙,8.87万,15.20万
8,大众 朗逸 2015款 1.6L 自动舒适版,2016年,10.5万公里,长沙,7.27万,14.90万
9,leixing,nianfen,licheng,didian,shoujia,yuanjia


## 二、数据转换

## 三、字符串操作