# 数据清洗和准备

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv(r'guazi.csv')
data

Unnamed: 0,leixing,nianfen,licheng,didian,shoujia,yuanjia
0,凯迪拉克ATS-L 2016款 28T 时尚型,2016年,2.5万公里,长沙,16.77万,34.60万
1,奥迪A6L 2014款 TFSI 标准型,2014年,13.8万公里,长沙,21.96万,44.50万
2,本田 思域 2016款 1.8L 自动舒适版,2016年,4.8万公里,长沙,8.87万,15.20万
3,大众 朗逸 2015款 1.6L 自动舒适版,2016年,10.5万公里,长沙,7.27万,14.90万
4,leixing,nianfen,licheng,didian,shoujia,yuanjia
5,凯迪拉克ATS-L 2016款 28T 时尚型,2016年,2.5万公里,长沙,16.77万,34.60万
6,奥迪A6L 2014款 TFSI 标准型,2014年,13.8万公里,长沙,21.96万,44.50万
7,本田 思域 2016款 1.8L 自动舒适版,2016年,4.8万公里,长沙,8.87万,15.20万
8,大众 朗逸 2015款 1.6L 自动舒适版,2016年,10.5万公里,长沙,7.27万,14.90万
9,leixing,nianfen,licheng,didian,shoujia,yuanjia


## 一、处理缺失数据
- 数据清洗是数据分析关键的一步，直接影响之后的处理工作
- 数据需要修改吗？有什么需要修改的吗？数据应该怎么调整才能适用于接下来的分析和挖掘？
- 是一个迭代的过程，实际项目中可能需要不止一次地执行这些清洗操作
- pd.fillna()
- pd.dropna()

In [3]:
data1 = pd.Series(['a', 'b', np.nan, 'd'])
data1

0      a
1      b
2    NaN
3      d
dtype: object

In [4]:
data1.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
data1[data1.isnull()]

2    NaN
dtype: object

In [6]:
data1[0] = None
data1.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### 1.1 滤除缺失数据

In [7]:
data1.dropna()

1    b
3    d
dtype: object

In [8]:
data1[data1.notnull()]

1    b
3    d
dtype: object

In [9]:
data2 = pd.DataFrame([[1., 6.5, 3.], [1, np.nan, np.nan], [np.nan, np.nan, np.nan], [np.nan, 6.7, 7.]])
data2

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.7,7.0


In [10]:
data2.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [11]:
# 丢弃全为nan的行，不是全为nan的则不会丢弃
data2.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.7,7.0


In [12]:
# 丢弃全为nan的列，不是全为nan的则不会丢弃
data2.dropna(how='all', axis=1)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.7,7.0


In [13]:
df = pd.DataFrame(np.random.randn(7,3))
df

Unnamed: 0,0,1,2
0,0.002366,0.68052,1.318554
1,0.450759,-0.531096,-0.087697
2,-0.531715,-0.292209,0.482542
3,-0.966777,0.353025,1.596803
4,1.789599,0.721754,-0.671818
5,0.742526,-0.017099,-1.436889
6,0.016634,-0.108316,0.417081


In [14]:
df.iloc[:4, 1] = np.nan
df

Unnamed: 0,0,1,2
0,0.002366,,1.318554
1,0.450759,,-0.087697
2,-0.531715,,0.482542
3,-0.966777,,1.596803
4,1.789599,0.721754,-0.671818
5,0.742526,-0.017099,-1.436889
6,0.016634,-0.108316,0.417081


In [15]:
df.iloc[:2, 2] = np.nan
df

Unnamed: 0,0,1,2
0,0.002366,,
1,0.450759,,
2,-0.531715,,0.482542
3,-0.966777,,1.596803
4,1.789599,0.721754,-0.671818
5,0.742526,-0.017099,-1.436889
6,0.016634,-0.108316,0.417081


In [16]:
df.dropna()

Unnamed: 0,0,1,2
4,1.789599,0.721754,-0.671818
5,0.742526,-0.017099,-1.436889
6,0.016634,-0.108316,0.417081


In [17]:
# thresh=n 删除掉缺失值数量为n的数据所对应的行(列)
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-0.531715,,0.482542
3,-0.966777,,1.596803
4,1.789599,0.721754,-0.671818
5,0.742526,-0.017099,-1.436889
6,0.016634,-0.108316,0.417081


In [18]:
df.dropna(thresh=4, axis=1)

Unnamed: 0,0,2
0,0.002366,
1,0.450759,
2,-0.531715,0.482542
3,-0.966777,1.596803
4,1.789599,-0.671818
5,0.742526,-1.436889
6,0.016634,0.417081


### 1.2 填充数据

In [19]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.002366,0.0,0.0
1,0.450759,0.0,0.0
2,-0.531715,0.0,0.482542
3,-0.966777,0.0,1.596803
4,1.789599,0.721754,-0.671818
5,0.742526,-0.017099,-1.436889
6,0.016634,-0.108316,0.417081


In [20]:
df1 = df.fillna({1:0.9, 2:0.2})
df1

Unnamed: 0,0,1,2
0,0.002366,0.9,0.2
1,0.450759,0.9,0.2
2,-0.531715,0.9,0.482542
3,-0.966777,0.9,1.596803
4,1.789599,0.721754,-0.671818
5,0.742526,-0.017099,-1.436889
6,0.016634,-0.108316,0.417081


In [21]:
df2 = pd.DataFrame(np.random.randn(6,3))
df2

Unnamed: 0,0,1,2
0,0.964485,0.664996,-0.468481
1,0.452549,0.697639,0.818415
2,0.751829,0.163571,-1.469237
3,1.70309,0.959652,-1.129748
4,1.207504,0.51582,0.107516
5,1.001196,2.292856,-1.139729


In [22]:
df2.iloc[2:, 1] = np.nan
df2.iloc[4:, 2] = np.nan
df2

Unnamed: 0,0,1,2
0,0.964485,0.664996,-0.468481
1,0.452549,0.697639,0.818415
2,0.751829,,-1.469237
3,1.70309,,-1.129748
4,1.207504,,
5,1.001196,,


In [23]:
# 把在nan数据值的前一个数据值复制给nan
df2.fillna(method='ffill')

Unnamed: 0,0,1,2
0,0.964485,0.664996,-0.468481
1,0.452549,0.697639,0.818415
2,0.751829,0.697639,-1.469237
3,1.70309,0.697639,-1.129748
4,1.207504,0.697639,-1.129748
5,1.001196,0.697639,-1.129748


In [24]:
# 对上面的无限复制进行限制
df2.fillna(method='ffill', limit=1)

Unnamed: 0,0,1,2
0,0.964485,0.664996,-0.468481
1,0.452549,0.697639,0.818415
2,0.751829,0.697639,-1.469237
3,1.70309,,-1.129748
4,1.207504,,-1.129748
5,1.001196,,


### 例子

In [25]:
data.dropna()

Unnamed: 0,leixing,nianfen,licheng,didian,shoujia,yuanjia
0,凯迪拉克ATS-L 2016款 28T 时尚型,2016年,2.5万公里,长沙,16.77万,34.60万
1,奥迪A6L 2014款 TFSI 标准型,2014年,13.8万公里,长沙,21.96万,44.50万
2,本田 思域 2016款 1.8L 自动舒适版,2016年,4.8万公里,长沙,8.87万,15.20万
3,大众 朗逸 2015款 1.6L 自动舒适版,2016年,10.5万公里,长沙,7.27万,14.90万
4,leixing,nianfen,licheng,didian,shoujia,yuanjia
5,凯迪拉克ATS-L 2016款 28T 时尚型,2016年,2.5万公里,长沙,16.77万,34.60万
6,奥迪A6L 2014款 TFSI 标准型,2014年,13.8万公里,长沙,21.96万,44.50万
7,本田 思域 2016款 1.8L 自动舒适版,2016年,4.8万公里,长沙,8.87万,15.20万
8,大众 朗逸 2015款 1.6L 自动舒适版,2016年,10.5万公里,长沙,7.27万,14.90万
9,leixing,nianfen,licheng,didian,shoujia,yuanjia


## 二、数据转换

### 移除重复数据

In [26]:
data = pd.DataFrame({'k1':['one', 'two'] * 3 + ['two'],
                    'k2':[1,1,2,3,3,4,4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [27]:
# 检查数据是否重复 返回布尔类型
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [28]:
# 会把重复的数据删除后返回新的对象，默认保留第一次出现的数据
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [29]:
data['v1'] = np.arange(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [30]:
# 按照某一列内的数据是否重复进行过滤
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [31]:
# 保留最后一个重复的数据
data.drop_duplicates(['k1'], keep='last')

Unnamed: 0,k1,k2,v1
4,one,3,4
6,two,4,6


### 利用函数或映射进行数据转换

In [32]:
data = pd.DataFrame({"food":["Apple", "banana", 'orange', 'apple', 'Mango', 'tomato'],
                    'price':[4, 3, 3.5, 6, 12, 3]})
data

Unnamed: 0,food,price
0,Apple,4.0
1,banana,3.0
2,orange,3.5
3,apple,6.0
4,Mango,12.0
5,tomato,3.0


In [33]:
meat = {'apple':'fruit', 
        'banana':'fruit',
       'orange':'fruit',
       'mango':'fruit',
       'tomato':'vegetable'}

In [34]:
# 把字符串转化成全部小写格式
low = data['food'].str.lower()
low

0     apple
1    banana
2    orange
3     apple
4     mango
5    tomato
Name: food, dtype: object

In [35]:
# 映射
data['class'] = data['food'].str.lower().map(meat)
data

Unnamed: 0,food,price,class
0,Apple,4.0,fruit
1,banana,3.0,fruit
2,orange,3.5,fruit
3,apple,6.0,fruit
4,Mango,12.0,fruit
5,tomato,3.0,vegetable


In [36]:
# 函数
data['class1'] = data['food'].map(lambda x: meat[x.lower()])
data

Unnamed: 0,food,price,class,class1
0,Apple,4.0,fruit,fruit
1,banana,3.0,fruit,fruit
2,orange,3.5,fruit,fruit
3,apple,6.0,fruit,fruit
4,Mango,12.0,fruit,fruit
5,tomato,3.0,vegetable,vegetable


### 替换值

In [37]:
data = pd.Series([1, -9999, 2, -1000, 3])
data

0       1
1   -9999
2       2
3   -1000
4       3
dtype: int64

In [38]:
# 替换时生成新的对象
data.replace(-9999, np.nan)

0       1.0
1       NaN
2       2.0
3   -1000.0
4       3.0
dtype: float64

In [39]:
data.replace([-9999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    0.0
4    3.0
dtype: float64

In [40]:
data.replace({-9999:np.nan, -1000:0})

0    1.0
1    NaN
2    2.0
3    0.0
4    3.0
dtype: float64

In [41]:
data

0       1
1   -9999
2       2
3   -1000
4       3
dtype: int64

#### data.replace 和 data.str.replace 是不一样的，注意区分

### 重命名轴索引

In [42]:
data = pd.DataFrame(np.arange(12).reshape(3,4),
                   index=['BeiJing', 'Tokyo', 'New York'],
                   columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
BeiJing,0,1,2,3
Tokyo,4,5,6,7
New York,8,9,10,11


In [43]:
# 重新索引reindex 产生新的对象
data.reindex(['Tokyo', 'New York', 'BeiJing'])

Unnamed: 0,one,two,three,four
Tokyo,4,5,6,7
New York,8,9,10,11
BeiJing,0,1,2,3


In [44]:
data

Unnamed: 0,one,two,three,four
BeiJing,0,1,2,3
Tokyo,4,5,6,7
New York,8,9,10,11


In [45]:
# 全大写
tran = lambda x: x[:4].upper()
data.index.map(tran)

Index(['BEIJ', 'TOKY', 'NEW '], dtype='object')

In [46]:
# 在原索引的基础上进行简单修改，不会对数据值产生影响
data.index = data.index.map(tran)
data

Unnamed: 0,one,two,three,four
BEIJ,0,1,2,3
TOKY,4,5,6,7
NEW,8,9,10,11


In [47]:
col = lambda x: x[:5].title()
data.columns = data.columns.map(col)

In [48]:
data

Unnamed: 0,One,Two,Three,Four
BEIJ,0,1,2,3
TOKY,4,5,6,7
NEW,8,9,10,11


In [49]:
# rename 更新索引格式
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Beij,0,1,2,3
Toky,4,5,6,7
New,8,9,10,11


In [50]:
# 结合字典型对象对标签进行更改
data.rename(index={'BEIJ':"北京", 'TOKY':'东京', 'NEW ':'纽约'}, columns={'One':1, 'Two':2, 'Three':3, 'Four':4})

Unnamed: 0,1,2,3,4
北京,0,1,2,3
东京,4,5,6,7
纽约,8,9,10,11


In [51]:
data

Unnamed: 0,One,Two,Three,Four
BEIJ,0,1,2,3
TOKY,4,5,6,7
NEW,8,9,10,11


In [53]:
# 使用inplace参数可以就地修改
data.rename(index={'BEIJ':"北京", 'TOKY':'东京', 'NEW ':'纽约'}, 
            columns={'One':1, 'Two':2, 'Three':3, 'Four':4}, 
            inplace=True)
data

Unnamed: 0,1,2,3,4
北京,0,1,2,3
东京,4,5,6,7
纽约,8,9,10,11


### 离散化和面元划分 ------ 分阶段 

In [55]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [56]:
# 面元 bin
bins = [18, 25, 35, 60, 100]

In [57]:
# cut划分
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [61]:
# 底层是数组
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [62]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

In [63]:
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [72]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [71]:
names = ['青年', '成年', '中年', '老年']
pd.cut(ages, bins, labels=names)

[青年, 青年, 青年, 成年, 青年, ..., 成年, 老年, 中年, 中年, 成年]
Length: 12
Categories (4, object): [青年 < 成年 < 中年 < 老年]

In [69]:
data = np.random.rand(20)
data

array([0.48044552, 0.62719402, 0.52347232, 0.56206366, 0.75355543,
       0.44163053, 0.21848816, 0.39637534, 0.76544515, 0.93174613,
       0.24598168, 0.04264067, 0.51434597, 0.47342487, 0.68563908,
       0.38125591, 0.12734189, 0.89329135, 0.31164574, 0.72250909])

In [77]:
# precision精度
pd.cut(data, 4, precision=2)

[(0.04, 1.59], (0.04, 1.59], (-1.51, 0.04], (-1.51, 0.04], (0.04, 1.59], ..., (-1.51, 0.04], (-1.51, 0.04], (0.04, 1.59], (-1.51, 0.04], (0.04, 1.59]]
Length: 1000
Categories (4, interval[float64]): [(-3.06, -1.51] < (-1.51, 0.04] < (0.04, 1.59] < (1.59, 3.14]]

In [78]:
pd.value_counts(pd.cut(data, 4, precision=2))

(-1.51, 0.04]     460
(0.04, 1.59]      414
(-3.06, -1.51]     65
(1.59, 3.14]       61
dtype: int64

In [74]:
# qcut函数
data = np.random.randn(1000)
data

array([ 2.92837440e-01,  1.04602392e+00, -6.58337587e-01, -2.01510642e-01,
        4.92638060e-01,  1.05108092e-01, -4.94151776e-01, -8.22661316e-01,
       -3.30884483e-01,  7.27654183e-02, -2.80132468e-01,  1.33808443e-01,
        4.75808497e-01,  4.55031870e-01,  3.59217401e-01,  2.03047693e+00,
        3.14217466e-01, -5.42672299e-01,  1.62747628e-01,  2.12239168e-01,
       -5.15659679e-01,  3.73626058e-01, -1.03156801e+00,  7.17238116e-01,
       -3.75617360e-01, -1.23615485e+00,  1.22134634e+00,  1.05328917e+00,
        1.29138874e+00, -7.64030193e-01, -1.70792942e+00, -9.95791590e-01,
        1.12212847e+00,  1.08717133e+00, -3.08530753e-01,  8.00805193e-01,
       -3.19951167e-01,  1.18427008e+00, -4.84373773e-01,  6.20855130e-01,
        2.13304902e+00,  1.86066918e+00, -2.40323119e-01,  9.07329709e-01,
        2.15998041e+00,  2.63865113e-01, -9.60694655e-01,  2.99363534e-01,
       -5.97998436e-01, -1.36081642e-01, -3.98839992e-01,  1.68087315e+00,
       -3.51083829e-01,  

In [75]:
cat1 = pd.qcut(data, 4)
cat1

[(-0.0337, 0.672], (0.672, 3.135], (-0.715, -0.0337], (-0.715, -0.0337], (-0.0337, 0.672], ..., (-0.715, -0.0337], (-3.0549999999999997, -0.715], (0.672, 3.135], (-0.715, -0.0337], (0.672, 3.135]]
Length: 1000
Categories (4, interval[float64]): [(-3.0549999999999997, -0.715] < (-0.715, -0.0337] < (-0.0337, 0.672] < (0.672, 3.135]]

In [76]:
pd.value_counts(cat1)

(0.672, 3.135]                   250
(-0.0337, 0.672]                 250
(-0.715, -0.0337]                250
(-3.0549999999999997, -0.715]    250
dtype: int64

In [83]:
cat2 = pd.qcut(data, [0, 0.1, 0.5, 0.9, 1])

In [85]:
pd.value_counts(cat2)

(-0.0337, 1.277]                 400
(-1.238, -0.0337]                400
(1.277, 3.135]                   100
(-3.0549999999999997, -1.238]    100
dtype: int64

### 检测和过滤异常值

In [98]:
data = pd.DataFrame(np.random.randn(1000, 4))
data

Unnamed: 0,0,1,2,3
0,-0.501320,0.011723,-1.062734,-0.050167
1,0.061326,1.916367,0.625497,0.653728
2,1.036834,-0.356520,-0.063509,0.059737
3,1.279888,-0.269750,-0.151261,-1.159670
4,-1.992019,1.233195,0.160540,1.819094
5,-0.300148,0.092512,-0.657771,0.192089
6,-0.012800,-0.050541,0.892240,0.588557
7,-0.696965,0.433229,0.763913,-1.220652
8,2.187642,0.990101,1.439428,-0.182735
9,-0.707561,0.122372,-0.814302,0.515403


In [99]:
# 捡出异常值所在的行，只要有就检出来
data[(np.abs(data)>3).any(1)]

Unnamed: 0,0,1,2,3
128,-3.232628,0.507787,-1.133553,-1.44463
294,0.593399,3.497413,-0.763357,-0.185109
307,-0.969554,3.592237,-0.453851,-1.649381
417,3.430733,0.55272,-0.816726,-0.342587
527,-3.473753,1.869546,1.751831,0.130971
544,0.336632,-0.768222,3.107723,0.74932
550,-3.227648,0.184382,-0.305403,0.074898
611,-3.01092,0.387229,-1.408967,0.724033
785,0.731701,-0.474553,0.2707,3.854677
853,-0.877505,-3.519648,0.377464,1.174835


In [93]:
data[np.abs(data)>3] = 3

In [95]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.028644,-0.044775,0.043361,0.001406
std,0.99229,1.062641,0.979624,0.997517
min,-2.833905,-2.966368,-2.595829,-2.822603
25%,-0.672613,-0.804625,-0.575855,-0.702495
50%,0.0231,-0.004307,0.098971,0.013076
75%,0.741554,0.696829,0.690729,0.689555
max,3.0,3.0,3.0,3.0


### 排列和随机采样

In [100]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5,4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [117]:
# np.random.permutation(n) 生成随机排列0-n的数组
sam = np.random.permutation(5)
sam

array([0, 3, 2, 1, 4])

In [118]:
# 重新排列行索引
df.take(sam)

Unnamed: 0,0,1,2,3
0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
1,4,5,6,7
4,16,17,18,19


In [119]:
# 可以取前n行数据，但排列顺序随机
df.take(np.random.permutation(3))

Unnamed: 0,0,1,2,3
2,8,9,10,11
0,0,1,2,3
1,4,5,6,7


In [146]:
# 随机取n行数据
df.sample(n=3)

Unnamed: 0,0,1,2,3
3,12,13,14,15
4,16,17,18,19
2,8,9,10,11


In [147]:
ch = pd.Series([5,7,1,6,3])
ch

0    5
1    7
2    1
3    6
4    3
dtype: int64

In [154]:
# 可以添加参数进行重复多次选择
ch.sample(n=10, replace=True)

1    7
3    6
3    6
2    1
2    1
0    5
4    3
1    7
2    1
3    6
dtype: int64

## 三、字符串操作

### 字符串对象方法

In [156]:
val = 'a,b, c'

In [162]:
# 分割
val.split(',')

['a', 'b', ' c']

In [167]:
# 去掉空白符(包括\n)
p = [x.strip() for x in val.split(',')]
p

['a', 'b', 'c']

In [169]:
# 字符串拼接
f,s,t = p
f +'::' + s + '::' + t

'a::b::c'

In [170]:
# join()
'::'.join(p)

'a::b::c'

In [171]:
'c' in p

True

In [172]:
val.index(',')

1

In [173]:
val.find(':')

-1

In [175]:
val.replace(',', '::')

'a::b:: c'

### 正则表达式

In [176]:
import re

In [179]:
text = 'foo  bar\t bat  \tqq'
text

'foo  bar\t bat  \tqq'

In [178]:
re.split('\s+', text)

['foo', 'bar', 'bat', 'qq']

In [182]:
# re.compile() 将正则表达式编译成一个对象，以后也可以直接使用
res = re.compile('\s+')
res.split(text)

['foo', 'bar', 'bat', 'qq']

In [184]:
reg = re.split(res, text)
reg

['foo', 'bar', 'bat', 'qq']

In [185]:
# .findall() 查找匹配所有
res.findall(text)

['  ', '\t ', '  \t']

In [186]:
re.findall(res, text)

['  ', '\t ', '  \t']

In [187]:
# 替换
re.sub(res, '9', text)

'foo9bar9bat9qq'

In [188]:
res.sub('9', text)

'foo9bar9bat9qq'

In [191]:
# 只匹配起始字符
t1 = re.match('b', text)
print(t1)

None


In [195]:
# 匹配全体字符，但只要找到一个就返回匹配项对象，后面还有没有也不管啦
t2 = re.search('b', text)
print(t2.group())

b


### pandas矢量化字符串函数

In [197]:
data = {'a':'dave@qq.com', 'b':'stave@gmail.com',
       'c':'sam@gmail.com', 'd':np.nan}
data = pd.Series(data)
data

a        dave@qq.com
b    stave@gmail.com
c      sam@gmail.com
d                NaN
dtype: object

In [198]:
data.isnull()

a    False
b    False
c    False
d     True
dtype: bool

In [206]:
# 因为有nan浮点类型，直接使用split就会报错
# data.map(lambda x: x.split('@'))

In [201]:
data.str.contains('gmail')

a    False
b     True
c     True
d      NaN
dtype: object

In [202]:
data.str.split('@')

a        [dave, qq.com]
b    [stave, gmail.com]
c      [sam, gmail.com]
d                   NaN
dtype: object

In [205]:
data.str.findall('@')

a    [@]
b    [@]
c    [@]
d    NaN
dtype: object

In [1]:
data.str[:5]

NameError: name 'data' is not defined