#  <font color=red> Module_06_資料整理與前處理</font>

## 如何處理資料遺漏

### 先建立一個有遺漏值的資料框

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import re

# Build a 5x3 frame holding the integers 0..14, rows labelled a-e and columns c1-c3.
df = pd.DataFrame(
    np.arange(15).reshape(5, 3),
    index=list('abcde'),
    columns=['c1', 'c2', 'c3'],
)
df

Unnamed: 0,c1,c2,c3
a,0,1,2
b,3,4,5
c,6,7,8
d,9,10,11
e,12,13,14


In [2]:
# pandas 使用了浮點數值 NaN (Not a Number) 來代表遺失資料，我們稱這個易於辨識的值為標記值
df['c4'] = np.nan

In [3]:
df.loc['f'] = np.arange(15, 19)

In [4]:
df

Unnamed: 0,c1,c2,c3,c4
a,0,1,2,
b,3,4,5,
c,6,7,8,
d,9,10,11,
e,12,13,14,
f,15,16,17,18.0


In [5]:
# Read row 'a' / column 'c4' with one .loc[row, col] lookup instead of chained indexing.
df.loc['a', 'c4']

nan

In [6]:
# The missing value is stored as a floating-point NaN; look it up with a single .loc call.
type(df.loc['a', 'c4'])

numpy.float64

---

In [7]:
df.loc['g'] = np.nan
df['c5'] = np.nan

In [8]:
df

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,,,,,


In [9]:
# Assign through one .loc[row, col] call: the chained form df['c4']['a'] = 20 may write
# into a temporary object instead of df itself.
df.loc['a', 'c4'] = 20

In [10]:
df

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,,,,,


### 判定 pandas 物件裡的 NaN 值

In [11]:
df

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,,,,,


In [12]:
# 返回還是資料框
df.isnull()

Unnamed: 0,c1,c2,c3,c4,c5
a,False,False,False,False,True
b,False,False,False,True,True
c,False,False,False,True,True
d,False,False,False,True,True
e,False,False,False,True,True
f,False,False,False,False,True
g,True,True,True,True,True


In [13]:
# Note: NaN marks a missing value and is a float; the key point is that np.nan != np.nan,
# whereas None == None.
# So the equality test below does not give the result we want (every cell is False).
df == np.nan 

Unnamed: 0,c1,c2,c3,c4,c5
a,False,False,False,False,False
b,False,False,False,False,False
c,False,False,False,False,False
d,False,False,False,False,False
e,False,False,False,False,False
f,False,False,False,False,False
g,False,False,False,False,False


In [14]:
# 每個欄位有幾個遺失值
df.isnull().sum()

c1    1
c2    1
c3    1
c4    5
c5    7
dtype: int64

In [15]:
df.isnull().sum(axis = 1)

a    1
b    2
c    2
d    2
e    2
f    1
g    5
dtype: int64

In [16]:
# 資料框內總共有幾個遺失值
df.isnull().sum().sum() 

15

---

In [17]:
# 資料框的 .count() 方法是計算非 NaN 的個數，預設 axis = 0
df.count() 

c1    6
c2    6
c3    6
c4    2
c5    0
dtype: int64

In [18]:
# 也可以得到資料框內總共有幾個遺失值
(len(df) - df.count()).sum() 

15

---

In [19]:
# 反過來問，還是返回資料框
df.notnull() 

Unnamed: 0,c1,c2,c3,c4,c5
a,True,True,True,True,False
b,True,True,True,False,False
c,True,True,True,False,False
d,True,True,True,False,False
e,True,True,True,False,False
f,True,True,True,True,False
g,False,False,False,False,False


In [20]:
df.notnull().sum().sum()

20

In [21]:
df.size

35

### 排除遺漏的資料項

In [22]:
df

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,,,,,


In [23]:
# 看看 c4 行的非 NaN 的值
# 布林選擇
df.c4[df.c4.notnull()] 

a    20.0
f    18.0
Name: c4, dtype: float64

In [24]:
# 使用序列的 .dropna() 得到同樣的效果
 # not in-place
df.c4.dropna()

a    20.0
f    18.0
Name: c4, dtype: float64

In [25]:
df.c4

a    20.0
b     NaN
c     NaN
d     NaN
e     NaN
f    18.0
g     NaN
Name: c4, dtype: float64

In [26]:
# DataFrame .dropna() method
# Defaults are axis = 0 and how = 'any': a row is dropped as soon as any NaN appears in it # not in-place
df.dropna() 

Unnamed: 0,c1,c2,c3,c4,c5


In [27]:
# 要整列都是 NaN 才會刪掉
df.dropna(how = 'all')

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,


In [28]:
# 先把整列都是 NaN 的列刪了，再把欄只要有  NaN 的也都刪了!
df.dropna(how = 'all').dropna(axis = 1, how = 'any')

Unnamed: 0,c1,c2,c3
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
d,9.0,10.0,11.0
e,12.0,13.0,14.0
f,15.0,16.0,17.0


In [29]:
df

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,,,,,


In [30]:
df.dropna(axis = 1, how = 'all')

Unnamed: 0,c1,c2,c3,c4
a,0.0,1.0,2.0,20.0
b,3.0,4.0,5.0,
c,6.0,7.0,8.0,
d,9.0,10.0,11.0,
e,12.0,13.0,14.0,
f,15.0,16.0,17.0,18.0
g,,,,


---

In [31]:
df2 = df.copy()
df2

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,,,,,


In [32]:
# Set both cells of row 'g' with one .loc assignment. The chained form
# df2.loc['g']['c1'] = 0 assigns into an intermediate row object and is not
# guaranteed to update df2.
df2.loc['g', ['c1', 'c3']] = 0

In [33]:
df2

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,0.0,,0.0,,


In [34]:
df2.dropna(axis = 1, how = 'any', inplace = True)

In [35]:
df2

Unnamed: 0,c1,c3
a,0.0,2.0
b,3.0,5.0
c,6.0,8.0
d,9.0,11.0
e,12.0,14.0
f,15.0,17.0
g,0.0,0.0


---

In [36]:
df

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,,,,,


In [37]:
# thresh 參數就是 threshold ，臨界點的意思
# 至少三個值才不會被捨棄
df.dropna(thresh = 3, axis = 1)

Unnamed: 0,c1,c2,c3
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
d,9.0,10.0,11.0
e,12.0,13.0,14.0
f,15.0,16.0,17.0
g,,,


### 在數學運算中處理 NaN 值

In [38]:
a = np.array([1, 2, np.nan, 3])
b = pd.Series(a)
a

array([ 1.,  2., nan,  3.])

In [39]:
b

0    1.0
1    2.0
2    NaN
3    3.0
dtype: float64

In [40]:
a.mean() # Numpy 函式與方法遇到 NaN 通常會回傳 NaN

nan

In [41]:
b.mean() # pandas 函式與方法通常會忽略 NaN # 所以答案是 (1+2+3)/3 = 2

2.0

---

In [42]:
s = df.c4
s

a    20.0
b     NaN
c     NaN
d     NaN
e     NaN
f    18.0
g     NaN
Name: c4, dtype: float64

In [43]:
s.sum()

38.0

In [44]:
s.mean()

19.0

In [45]:
# 就算是累積總和的方法 .cumsum() 也是會忽略遺失值
s.cumsum()

a    20.0
b     NaN
c     NaN
d     NaN
e     NaN
f    38.0
g     NaN
Name: c4, dtype: float64

In [46]:
df.c4 + 1 # 但如果是傳統的數學運算子，NaN 會傳遞到最終的結果

a    21.0
b     NaN
c     NaN
d     NaN
e     NaN
f    19.0
g     NaN
Name: c4, dtype: float64

### 填入遺漏的資料

In [47]:
df

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,,,,,


In [48]:
# 只要是遺失值的都填入 0
# not in-place
filled = df.fillna(0) 
filled

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,0.0
b,3.0,4.0,5.0,0.0,0.0
c,6.0,7.0,8.0,0.0,0.0
d,9.0,10.0,11.0,0.0,0.0
e,12.0,13.0,14.0,0.0,0.0
f,15.0,16.0,17.0,18.0,0.0
g,0.0,0.0,0.0,0.0,0.0


In [49]:
df

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,,,,,


In [50]:
df.mean() 

c1     7.5
c2     8.5
c3     9.5
c4    19.0
c5     NaN
dtype: float64

In [51]:
# 注意!! 填完遺失值的數學統計量跟未填入算出來會產生不同的結果
# 遺失值如果沒有被填入，在操作 pandas 的方法時通常會被忽略
filled.mean()

c1    6.428571
c2    7.285714
c3    8.142857
c4    5.428571
c5    0.000000
dtype: float64

---

In [52]:
df2 = df.copy()
df2

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,,,,,


In [53]:
# 可以給個字典，給出哪個欄位的遺失值要填入哪個值
# not in-place
df2.fillna({'c2': 0, 'c4': 100})

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,100.0,
c,6.0,7.0,8.0,100.0,
d,9.0,10.0,11.0,100.0,
e,12.0,13.0,14.0,100.0,
f,15.0,16.0,17.0,18.0,
g,,0.0,,100.0,


In [54]:
df2

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,,,,,


In [55]:
df2.fillna({'c2': 0, 'c4': 100}, inplace = True)
df2

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,100.0,
c,6.0,7.0,8.0,100.0,
d,9.0,10.0,11.0,100.0,
e,12.0,13.0,14.0,100.0,
f,15.0,16.0,17.0,18.0,
g,,0.0,,100.0,


### 以向前及向後方式填充遺漏值

In [56]:
df.c4

a    20.0
b     NaN
c     NaN
d     NaN
e     NaN
f    18.0
g     NaN
Name: c4, dtype: float64

In [57]:
# Forward fill (carry the last valid value forward): a common technique for time series.
# .ffill() replaces the deprecated fillna(method = 'ffill') spelling.
df.c4.ffill()

a    20.0
b    20.0
c    20.0
d    20.0
e    20.0
f    18.0
g    18.0
Name: c4, dtype: float64

In [58]:
 # Backward fill. .ffill() and .bfill() are Series/DataFrame methods; pandas has no
 # top-level pd.ffill() / pd.bfill() functions.
df.c4.bfill()

a    20.0
b    18.0
c    18.0
d    18.0
e    18.0
f    18.0
g     NaN
Name: c4, dtype: float64

---

In [59]:
df

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,,,,,


In [60]:
# Forward fill works the same way on a DataFrame, filling down each column.
df.ffill()

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,20.0,
c,6.0,7.0,8.0,20.0,
d,9.0,10.0,11.0,20.0,
e,12.0,13.0,14.0,20.0,
f,15.0,16.0,17.0,18.0,
g,15.0,16.0,17.0,18.0,


In [61]:
# limit caps how many consecutive missing values get filled; axis = 1 fills along each row.
df.ffill(axis = 1, limit = 1)

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,20.0
b,3.0,4.0,5.0,5.0,
c,6.0,7.0,8.0,8.0,
d,9.0,10.0,11.0,11.0,
e,12.0,13.0,14.0,14.0,
f,15.0,16.0,17.0,18.0,18.0
g,,,,,


### 利用索引標籤填值

In [62]:
df

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,,,,,


In [63]:
fill_valued = pd.Series([100, 101, 102], index = ['a', 'e', 'g'])
fill_valued

a    100
e    101
g    102
dtype: int64

In [64]:
df.c4

a    20.0
b     NaN
c     NaN
d     NaN
e     NaN
f    18.0
g     NaN
Name: c4, dtype: float64

In [65]:
# 只有 NaN 的項目才能被擴充取代，注意標籤 a 的值沒變
# 裡面的 fill_valued 也可以是字典
df.c4.fillna(fill_valued) 

a     20.0
b      NaN
c      NaN
d      NaN
e    101.0
f     18.0
g    102.0
Name: c4, dtype: float64

---

In [66]:
df

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,,,,,


In [67]:
# 非常常見又方便的作法，每個欄位的遺失值都用那個欄位的平均值取代
# 如果是資料框，序列對應都是在看欄位
df.fillna(df.mean())

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,19.0,
c,6.0,7.0,8.0,19.0,
d,9.0,10.0,11.0,19.0,
e,12.0,13.0,14.0,19.0,
f,15.0,16.0,17.0,18.0,
g,7.5,8.5,9.5,19.0,


### 內插求出遺漏值

In [68]:
s = pd.Series([1, np.nan, np.nan, np.nan, 2])
s

0    1.0
1    NaN
2    NaN
3    NaN
4    2.0
dtype: float64

In [69]:
s.interpolate

<bound method Series.interpolate of 0    1.0
1    NaN
2    NaN
3    NaN
4    2.0
dtype: float64>

In [70]:
# 預設是以位置做內插
# 如果你的資料代表漸增的值，例如溫度，這方法比補 0 好多了
# 位置從 0 到 4 走了 (4 - 0) = 4 個單位
# 值從 1 變化到 2
# 1 單位變化了 (2-1)/(4-0) = 0.25 個值
s.interpolate()  

0    1.00
1    1.25
2    1.50
3    1.75
4    2.00
dtype: float64

---

In [71]:
ts = pd.Series([1, np.nan, 2],
              index = [datetime(2014, 1, 1), datetime(2014, 2, 1), datetime(2014, 4, 1)])
ts

2014-01-01    1.0
2014-02-01    NaN
2014-04-01    2.0
dtype: float64

In [72]:
# 一單位變化了 (2 - 1)/(2 - 0) = 0.5 個值
# 預設是看位置
# 但如果考慮到日期索引，有更好的方式
ts.interpolate() 

2014-01-01    1.0
2014-02-01    1.5
2014-04-01    2.0
dtype: float64

In [73]:
# 方法改成看時間
# 所以一天變化了 (2 - 1)/(datetime(2014, 4, 1) - datetime(2014, 1, 1)) 個值
# 再看 1/1 到 2/1 過了幾天!
ts.interpolate(method = 'time') 

2014-01-01    1.000000
2014-02-01    1.344444
2014-04-01    2.000000
dtype: float64

---

In [74]:
s = pd.Series([0, np.nan, 100], index = [0, 1, 10])
s

0       0.0
1       NaN
10    100.0
dtype: float64

In [75]:
# 預設是以位置做內插
# 一單位變化了 (100 - 0)/(2 - 0) = 50 個值
s.interpolate() 

0       0.0
1      50.0
10    100.0
dtype: float64

In [76]:
# 按照索引值做內插 
# 所以一單位變化了 (100 - 0)/(10 - 0) = 10 個值
s.interpolate(method = 'values') 

0       0.0
1      10.0
10    100.0
dtype: float64

## 處理重複資料

In [77]:
# Seven rows in which some (a, b) pairs repeat, used to demonstrate duplicate handling.
data = pd.DataFrame({
    'a': list('xxxyyyy'),
    'b': [1, 1, 2, 3, 3, 4, 4],
})
data

Unnamed: 0,a,b
0,x,1
1,x,1
2,x,2
3,y,3
4,y,3
5,y,4
6,y,4


In [78]:
data.duplicated() # 由上往下看是否有重複列

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [79]:
# 有 in-place 參數可供使用
# 預設 keep = 'first'
# 也可以用 data[~data.duplicated()]
data.drop_duplicates()  

Unnamed: 0,a,b
0,x,1
2,x,2
3,y,3
5,y,4


In [80]:
data.drop_duplicates(keep = 'last')

Unnamed: 0,a,b
1,x,1
2,x,2
4,y,3
6,y,4


---

In [81]:
data['c'] = range(7)
data

Unnamed: 0,a,b,c
0,x,1,0
1,x,1,1
2,x,2,2
3,y,3,3
4,y,3,4
5,y,4,5
6,y,4,6


In [82]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
dtype: bool

In [83]:
data.drop_duplicates(['a', 'b']) # 只看 a 與 b 行來決定

Unnamed: 0,a,b,c
0,x,1,0
2,x,2,2
3,y,3,3
5,y,4,5


## 資料轉換

### 將資料映射成不同的值

In [84]:
x = pd.Series({'one': 1, 'two': 2, 'three': 3})
x

one      1
two      2
three    3
dtype: int64

In [85]:
y = pd.Series({1: 'a', 2: 'b', 3: 'c'})
y

1    a
2    b
3    c
dtype: object

In [86]:
# 順便想想 python 是如何操作 map() 函式
# 在 mod02 我們有討論過 # series.map() 作用在值上，沒有資料框方法
# 裡面放函數，然後作用在每個值上 # 如果放序列也是類似的
x.map(y)

one      a
two      b
three    c
dtype: object

---

In [87]:
x = pd.Series({'one': 1, 'two': 2, 'three':3}) 
y = pd.Series({1:'a', 2:'b'})
x

one      1
two      2
three    3
dtype: int64

In [88]:
y

1    a
2    b
dtype: object

In [89]:
x.map(y) # 沒有對應關係會出現 NaN

one        a
two        b
three    NaN
dtype: object

---

In [90]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [91]:
meat_to_animal = {
  'bacon': 'pig',
  'pulled pork': 'pig',
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}
meat_to_animal

{'bacon': 'pig',
 'pulled pork': 'pig',
 'pastrami': 'cow',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon'}

In [92]:
# 利用 .str 屬性會拿到字串物件，再利用字串的 lower() 方法
# 會一一對每個值操作 .lower() 方法
lowercased = data['food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [93]:
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [94]:
# 另一種寫法得到同樣結果 
# 也可以用 .apply() 方法
# df.apply() 方法作用在序列，series.apply() 方法作用在值上
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

### 值的取代

In [95]:
s = pd.Series([0, 1, 2, 3, 2, 4], index = np.arange(2, 8))
s

2    0
3    1
4    2
5    3
6    2
7    4
dtype: int64

In [96]:
s.replace(2, 5) # 值是 2 的換成 5 # not in-place

2    0
3    1
4    5
5    3
6    5
7    4
dtype: int64

In [97]:
s

2    0
3    1
4    2
5    3
6    2
7    4
dtype: int64

---

In [98]:
s.replace([2, 3], np.nan) # 值是 2 或 3 的都換成遺失值

2    0.0
3    1.0
4    NaN
5    NaN
6    NaN
7    4.0
dtype: float64

In [99]:
s.replace([3, 4, 0], [7, 8, 1]) # 值是 3 的換成 7，值是 4 的換成 8，值是 0 的換成 1

2    1
3    1
4    2
5    7
6    2
7    8
dtype: int64

In [100]:
# 也可用字典表達
# 值 0 換成 100，值 3 換成 300
s.replace({0: 100, 3: 300}) 

2    100
3      1
4      2
5    300
6      2
7      4
dtype: int64

---

In [101]:
ss = s.copy()
ss[3] = np.nan
ss

2    0.0
3    NaN
4    2.0
5    3.0
6    2.0
7    4.0
dtype: float64

In [102]:
ss.replace(np.nan, 0, inplace = True)

In [103]:
ss

2    0.0
3    0.0
4    2.0
5    3.0
6    2.0
7    4.0
dtype: float64

---

In [104]:
df = pd.DataFrame({'a': [0, 1, 2, 3, 4], 
                   'b': [5, 6, 7, 1, 9]})
df

Unnamed: 0,a,b
0,0,5
1,1,6
2,2,7
3,3,1
4,4,9


In [105]:
# 資料框的 .replace() 方法，把值 1 的都換 10
df.replace(1, 10)

Unnamed: 0,a,b
0,0,5
1,10,6
2,2,7
3,3,10
4,4,9


In [106]:
# DataFrame .replace() with a dict: replace the value 2 in column a and the value 9 in column b with 10
df.replace({'a': 2, 'b': 9}, 10)

Unnamed: 0,a,b
0,0,5
1,1,6
2,10,7
3,3,1
4,4,10


---

In [107]:
s

2    0
3    1
4    2
5    3
6    2
7    4
dtype: int64

In [108]:
# Treat the values 0 and 4 as gaps and forward-fill them from the previous kept value.
# Series.replace(..., method = ...) is deprecated, so the gaps are masked and filled explicitly.
# A leading gap has nothing before it and keeps its original value; the integer dtype is restored.
s.mask(s.isin([0, 4])).ffill().fillna(s).astype(s.dtype)

2    0
3    1
4    2
5    3
6    2
7    2
dtype: int64

In [109]:
# Treat the values 0 and 4 as gaps and backward-fill them from the next kept value.
# Series.replace(..., method = ...) is deprecated, so the gaps are masked and filled explicitly.
# A trailing gap has nothing after it and keeps its original value; the integer dtype is restored.
s.mask(s.isin([0, 4])).bfill().fillna(s).astype(s.dtype)

2    1
3    1
4    2
5    3
6    2
7    4
dtype: int64

### 更名軸 index

In [110]:
# 3x4 frame of the integers 0..11, indexed by state name with ordinal column labels.
data = pd.DataFrame(
    np.arange(3 * 4).reshape(3, 4),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
# Keep an independent copy so the relabelling demo on data2 leaves data untouched.
data2 = data.copy(deep=True)
data2

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [111]:
data2.columns = ['col1', 'col2', 'col3', 'col4']
data2.index = ['a', 'b', 'c']
data2

Unnamed: 0,col1,col2,col3,col4
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11


---

In [112]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [113]:
def transform(x):
    """Return the first four characters of *x*, upper-cased."""
    return x[:4].upper()

In [114]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [115]:
data.index = data.index.map(transform)

In [116]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


---

In [117]:
data.rename(index = str.title, columns = str.upper) # index 與 columns 參數也可以接函式或方法

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [118]:
# 用 .rename() 方法可以只單獨改某幾個索引標籤
data.rename(index = {'OHIO': 'INDIANA'},
            columns = {'three': 'peekaboo'}) # 參數可以接字典

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [119]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [120]:
data.rename(index = {'OHIO': 'INDIANA'},
            columns = {'three': 'peekaboo'}, inplace = True)
data

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### 離散化和分組

In [121]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
ages

[20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [122]:
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [123]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [124]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [125]:
cats.value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [126]:
cats.describe() 

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
"(18, 25]",5,0.416667
"(25, 35]",3,0.25
"(35, 60]",3,0.25
"(60, 100]",1,0.083333


---

In [127]:
# Same twelve ages as before, this time held in a single-column DataFrame.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
df = pd.DataFrame({'age': ages})
df

Unnamed: 0,age
0,20
1,22
2,25
3,27
4,21
5,23
6,37
7,31
8,61
9,45


In [128]:
bins = [18, 25, 35, 60, 100]
labels = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(df['age'], bins = bins, labels = labels )

0          Youth
1          Youth
2          Youth
3     YoungAdult
4          Youth
5          Youth
6     MiddleAged
7     YoungAdult
8         Senior
9     MiddleAged
10    MiddleAged
11    YoungAdult
Name: age, dtype: category
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [129]:
df['age_cat'] = pd.cut(df['age'], bins = bins, labels = labels )
df

Unnamed: 0,age,age_cat
0,20,Youth
1,22,Youth
2,25,Youth
3,27,YoungAdult
4,21,Youth
5,23,Youth
6,37,MiddleAged
7,31,YoungAdult
8,61,Senior
9,45,MiddleAged


In [130]:
df['age_cat'].describe()

count        12
unique        4
top       Youth
freq          5
Name: age_cat, dtype: object

In [131]:
df['age_cat'].value_counts()  # 更多內容請參考類別資料的章節

Youth         5
YoungAdult    3
MiddleAged    3
Senior        1
Name: age_cat, dtype: int64

In [132]:
df.sort_values(by = "age_cat")

Unnamed: 0,age,age_cat
0,20,Youth
1,22,Youth
2,25,Youth
4,21,Youth
5,23,Youth
3,27,YoungAdult
7,31,YoungAdult
11,32,YoungAdult
6,37,MiddleAged
9,45,MiddleAged


### 偵測和濾除離群值

In [133]:
np.random.seed(123456)
data = pd.DataFrame(np.random.randn(1000, 4)) # 母體平均值是 0，標準差是 1
data[:10]

Unnamed: 0,0,1,2,3
0,0.469112,-0.282863,-1.509059,-1.135632
1,1.212112,-0.173215,0.119209,-1.044236
2,-0.861849,-2.104569,-0.494929,1.071804
3,0.721555,-0.706771,-1.039575,0.27186
4,-0.424972,0.56702,0.276232,-1.087401
5,-0.67369,0.113648,-1.478427,0.524988
6,0.404705,0.577046,-1.715002,-1.039268
7,-0.370647,-1.157892,-1.344312,0.844885
8,1.07577,-0.10905,1.643563,-1.469388
9,0.357021,-0.6746,-1.776904,-0.968914


In [134]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.035552,0.051018,-0.021633,0.007192
std,1.035059,0.975583,1.027132,1.031519
min,-3.303099,-2.929552,-3.520876,-3.192716
25%,-0.73835,-0.574433,-0.730783,-0.712537
50%,-0.05284,0.064222,-0.015013,0.020372
75%,0.718834,0.711438,0.682998,0.71732
max,3.004229,3.501927,3.357427,3.565769


In [135]:
data_cp = data.copy()
data_cp

Unnamed: 0,0,1,2,3
0,0.469112,-0.282863,-1.509059,-1.135632
1,1.212112,-0.173215,0.119209,-1.044236
2,-0.861849,-2.104569,-0.494929,1.071804
3,0.721555,-0.706771,-1.039575,0.271860
4,-0.424972,0.567020,0.276232,-1.087401
...,...,...,...,...
995,0.979256,-1.180839,-0.544735,-0.714758
996,-0.701966,-1.233189,1.316605,0.313410
997,-0.591876,1.226774,0.227639,1.709756
998,-0.061853,1.093207,-0.179307,-1.672583


In [136]:
# Outliers in the column labelled 2: values whose absolute distance from that
# column's mean exceeds 3 standard deviations.
mask = np.abs(data_cp[2] - data_cp[2].mean()) > 3*data_cp[2].std()
# Pick the flagged rows and the column in one .loc call rather than chained indexing.
data_cp.loc[mask, 2]

26     3.357427
63    -3.201750
112   -3.520876
474    3.160915
492    3.124365
966   -3.136046
Name: 2, dtype: float64

In [137]:
# 一種作法去除離群值
data_cp[~mask]

Unnamed: 0,0,1,2,3
0,0.469112,-0.282863,-1.509059,-1.135632
1,1.212112,-0.173215,0.119209,-1.044236
2,-0.861849,-2.104569,-0.494929,1.071804
3,0.721555,-0.706771,-1.039575,0.271860
4,-0.424972,0.567020,0.276232,-1.087401
...,...,...,...,...
995,0.979256,-1.180839,-0.544735,-0.714758
996,-0.701966,-1.233189,1.316605,0.313410
997,-0.591876,1.226774,0.227639,1.709756
998,-0.061853,1.093207,-0.179307,-1.672583


---

In [138]:
data_cp

Unnamed: 0,0,1,2,3
0,0.469112,-0.282863,-1.509059,-1.135632
1,1.212112,-0.173215,0.119209,-1.044236
2,-0.861849,-2.104569,-0.494929,1.071804
3,0.721555,-0.706771,-1.039575,0.271860
4,-0.424972,0.567020,0.276232,-1.087401
...,...,...,...,...
995,0.979256,-1.180839,-0.544735,-0.714758
996,-0.701966,-1.233189,1.316605,0.313410
997,-0.591876,1.226774,0.227639,1.709756
998,-0.061853,1.093207,-0.179307,-1.672583


In [139]:
# Find which rows contain a value more than 3 standard deviations from its column mean.
# One approach is to drop every row that contains such an outlier.
# NOTE(review): data_cp is re-filtered inside the loop, so each later column's mean/std is
# computed on the rows that survived the earlier columns; `mask` is also overwritten here.
for i in data_cp.columns:
    mask = np.abs(data_cp[i] - data_cp[i].mean()) > 3*data_cp[i].std()
    data_cp = data_cp[~mask]

In [140]:
data_cp

Unnamed: 0,0,1,2,3
0,0.469112,-0.282863,-1.509059,-1.135632
1,1.212112,-0.173215,0.119209,-1.044236
2,-0.861849,-2.104569,-0.494929,1.071804
3,0.721555,-0.706771,-1.039575,0.271860
4,-0.424972,0.567020,0.276232,-1.087401
...,...,...,...,...
995,0.979256,-1.180839,-0.544735,-0.714758
996,-0.701966,-1.233189,1.316605,0.313410
997,-0.591876,1.226774,0.227639,1.709756
998,-0.061853,1.093207,-0.179307,-1.672583


In [141]:
# 我們把有包含超過 3 個標準差的值的列都刪掉了
# 濾除離群值，通常利於之後的分析
data_cp.describe()

Unnamed: 0,0,1,2,3
count,987.0,987.0,987.0,987.0
mean,-0.034916,0.054759,-0.020308,0.004986
std,1.028494,0.964179,0.997902,1.020149
min,-2.666646,-2.833091,-2.979019,-3.101335
25%,-0.736885,-0.571525,-0.729037,-0.714776
50%,-0.05039,0.065624,-0.014805,0.012349
75%,0.710786,0.71189,0.674067,0.712745
max,3.004229,2.858021,2.918333,2.918398


In [142]:
# 跟沒有刪除離群值的統計量比較一下
# 刪除離群值的平均值跟標準差估的比較準一點
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.035552,0.051018,-0.021633,0.007192
std,1.035059,0.975583,1.027132,1.031519
min,-3.303099,-2.929552,-3.520876,-3.192716
25%,-0.73835,-0.574433,-0.730783,-0.712537
50%,-0.05284,0.064222,-0.015013,0.020372
75%,0.718834,0.711438,0.682998,0.71732
max,3.004229,3.501927,3.357427,3.565769


### 排列與隨機取樣

In [143]:
df = pd.DataFrame(np.arange(20).reshape(5, 4), index = list('abced'))
df

Unnamed: 0,0,1,2,3
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11
e,12,13,14,15
d,16,17,18,19


In [144]:
# 把 0, 1, 2, 3, 4 隨機排列，也就是洗牌的意思
samper = np.random.permutation(len(df))
samper

array([3, 1, 2, 0, 4])

In [145]:
df.iloc[samper]

Unnamed: 0,0,1,2,3
e,12,13,14,15
b,4,5,6,7
c,8,9,10,11
a,0,1,2,3
d,16,17,18,19


In [146]:
# 跟上面一樣結果
# 回憶一下之前學過的 series.take()
# Return the elements in the given *positional* indices along an axis.
df.take(samper)

Unnamed: 0,0,1,2,3
e,12,13,14,15
b,4,5,6,7
c,8,9,10,11
a,0,1,2,3
d,16,17,18,19


---

In [147]:
# 隨機選出三列，預設 replace = False，也就是取後不放回
df.sample(n = 3)

Unnamed: 0,0,1,2,3
e,12,13,14,15
c,8,9,10,11
a,0,1,2,3


In [148]:
# 預設 replace = False，也就是取後不放回
# 就跟剛剛用 permutation 的效果是一樣
df.sample(n = len(df))

Unnamed: 0,0,1,2,3
e,12,13,14,15
d,16,17,18,19
c,8,9,10,11
b,4,5,6,7
a,0,1,2,3


In [149]:
# 取後放回
# 可參考我們 mod03，之前有稍微提過 .sample() 方法，複習一下
df.sample(n = 10, replace = True)

Unnamed: 0,0,1,2,3
d,16,17,18,19
a,0,1,2,3
c,8,9,10,11
e,12,13,14,15
c,8,9,10,11
c,8,9,10,11
d,16,17,18,19
c,8,9,10,11
b,4,5,6,7
d,16,17,18,19


### 指標 (indicator) 與虛擬變數 (dummy)

In [150]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [151]:
df['key']

0    b
1    b
2    a
3    c
4    a
5    b
Name: key, dtype: object

In [152]:
# One indicator column per distinct key; prefix = 'key' prepends 'key_' to each column name.
# dtype is pinned to small unsigned integers so the indicators display as 0/1;
# newer pandas releases would otherwise default to boolean True/False columns.
dummies = pd.get_dummies(df['key'], prefix = 'key', dtype = np.uint8)
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [153]:
# 注意 df[['data1']] 的寫法，出來會是一個資料框
# 因為只有資料框才有 .join() 方法，不然用 pd.concat() 函式也可以
# 資料框的 .join() 方法之後有章節會教到
df_with_dummy = df[['data1']].join(dummies) 
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [154]:
pd.concat([df[['data1']], dummies], axis = 1)

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


---

In [155]:
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('./mod06/movies.dat', sep='::',
                       header = None, names = mnames, encoding = "ISO-8859-1", engine = 'python') # 注意編碼跟引擎
movies

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [156]:
# Alternative: movies.genres.str.split("|") combined with a loop
# Collect every genre of every movie into one flat list (repeats included).
all_genres = []
for i in movies.genres:
    all_genres.extend(i.split('|'))
all_genres

['Animation',
 "Children's",
 'Comedy',
 'Adventure',
 "Children's",
 'Fantasy',
 'Comedy',
 'Romance',
 'Comedy',
 'Drama',
 'Comedy',
 'Action',
 'Crime',
 'Thriller',
 'Comedy',
 'Romance',
 'Adventure',
 "Children's",
 'Action',
 'Action',
 'Adventure',
 'Thriller',
 'Comedy',
 'Drama',
 'Romance',
 'Comedy',
 'Horror',
 'Animation',
 "Children's",
 'Drama',
 'Action',
 'Adventure',
 'Romance',
 'Drama',
 'Thriller',
 'Drama',
 'Romance',
 'Thriller',
 'Comedy',
 'Action',
 'Action',
 'Comedy',
 'Drama',
 'Crime',
 'Drama',
 'Thriller',
 'Thriller',
 'Drama',
 'Sci-Fi',
 'Drama',
 'Romance',
 'Drama',
 'Drama',
 'Romance',
 'Adventure',
 'Sci-Fi',
 'Drama',
 'Drama',
 'Drama',
 'Sci-Fi',
 'Adventure',
 'Romance',
 "Children's",
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'Drama',
 'Documentary',
 'Comedy',
 'Comedy',
 'Romance',
 'Drama',
 'Drama',
 'War',
 'Action',
 'Crime',
 'Drama',
 'Drama',
 'Action',
 'Adventure',
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'Crime',
 'Thrill

In [157]:
all_genres = []
for i in movies.genres:
    all_genres.extend(i.split('|'))
all_genres

['Animation',
 "Children's",
 'Comedy',
 'Adventure',
 "Children's",
 'Fantasy',
 'Comedy',
 'Romance',
 'Comedy',
 'Drama',
 'Comedy',
 'Action',
 'Crime',
 'Thriller',
 'Comedy',
 'Romance',
 'Adventure',
 "Children's",
 'Action',
 'Action',
 'Adventure',
 'Thriller',
 'Comedy',
 'Drama',
 'Romance',
 'Comedy',
 'Horror',
 'Animation',
 "Children's",
 'Drama',
 'Action',
 'Adventure',
 'Romance',
 'Drama',
 'Thriller',
 'Drama',
 'Romance',
 'Thriller',
 'Comedy',
 'Action',
 'Action',
 'Comedy',
 'Drama',
 'Crime',
 'Drama',
 'Thriller',
 'Thriller',
 'Drama',
 'Sci-Fi',
 'Drama',
 'Romance',
 'Drama',
 'Drama',
 'Romance',
 'Adventure',
 'Sci-Fi',
 'Drama',
 'Drama',
 'Drama',
 'Sci-Fi',
 'Adventure',
 'Romance',
 "Children's",
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'Drama',
 'Documentary',
 'Comedy',
 'Comedy',
 'Romance',
 'Drama',
 'Drama',
 'War',
 'Action',
 'Crime',
 'Drama',
 'Drama',
 'Action',
 'Adventure',
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'Crime',
 'Thrill

In [158]:
# Use pd.unique() to find the distinct movie genres, in order of first appearance.
# all_genres is a plain Python list; convert it to an object ndarray first because
# passing a bare list to pd.unique is deprecated.
genres = pd.unique(np.array(all_genres, dtype = object))
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [159]:
# 準備來建立虛擬變數
zero_matrix = np.zeros((len(movies), len(genres)))
dummies = pd.DataFrame(zero_matrix, columns = genres)
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [160]:
gen = movies.genres[0].split('|')
gen

['Animation', "Children's", 'Comedy']

In [161]:
# index 物件有 .get_indexer() 方法，給索引標籤會返回位置
dummies.columns.get_indexer(gen)

array([0, 1, 2], dtype=int64)

In [162]:
# For each movie, look up the column positions of its genres and set those cells to 1.
# (Converting to positions is optional; the column labels could be used directly.)
# A single .iloc[row, cols] assignment writes into dummies itself, whereas the chained
# dummies.iloc[i][indices] = 1 form can end up modifying a temporary row copy.
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1

In [163]:
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [164]:
# Equivalent: pd.concat([movies, dummies.add_prefix('Genre')], axis = 1)
movies_windic = movies.join(dummies.add_prefix('Genre')) # DataFrame.add_prefix() prepends a string to every column name
movies_windic

Unnamed: 0,movie_id,title,genres,GenreAnimation,GenreChildren's,GenreComedy,GenreAdventure,GenreFantasy,GenreRomance,GenreDrama,...,GenreCrime,GenreThriller,GenreHorror,GenreSci-Fi,GenreDocumentary,GenreWar,GenreMusical,GenreMystery,GenreFilm-Noir,GenreWestern
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,3949,Requiem for a Dream (2000),Drama,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,3950,Tigerland (2000),Drama,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,3951,Two Family House (2000),Drama,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [165]:
movies_windic.iloc[0]

movie_id                                      1
title                          Toy Story (1995)
genres              Animation|Children's|Comedy
GenreAnimation                              1.0
GenreChildren's                             1.0
GenreComedy                                 1.0
GenreAdventure                              0.0
GenreFantasy                                0.0
GenreRomance                                0.0
GenreDrama                                  0.0
GenreAction                                 0.0
GenreCrime                                  0.0
GenreThriller                               0.0
GenreHorror                                 0.0
GenreSci-Fi                                 0.0
GenreDocumentary                            0.0
GenreWar                                    0.0
GenreMusical                                0.0
GenreMystery                                0.0
GenreFilm-Noir                              0.0
GenreWestern                                0.0
Name: 0, dtype: object

---

In [166]:
np.random.seed(12345)
values = np.random.rand(10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [167]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

In [168]:
# 這是類別物件
cat = pd.cut(values, bins = bins)
cat

[(0.8, 1.0], (0.2, 0.4], (0.0, 0.2], (0.2, 0.4], (0.4, 0.6], (0.4, 0.6], (0.8, 1.0], (0.6, 0.8], (0.6, 0.8], (0.6, 0.8]]
Categories (5, interval[float64, right]): [(0.0, 0.2] < (0.2, 0.4] < (0.4, 0.6] < (0.6, 0.8] < (0.8, 1.0]]

In [169]:
cat.value_counts()

(0.0, 0.2]    1
(0.2, 0.4]    2
(0.4, 0.6]    2
(0.6, 0.8]    3
(0.8, 1.0]    2
dtype: int64

In [170]:
cat.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0.0, 0.2]",1,0.1
"(0.2, 0.4]",2,0.2
"(0.4, 0.6]",2,0.2
"(0.6, 0.8]",3,0.3
"(0.8, 1.0]",2,0.2


In [171]:
# pd.get_dummies 函式放入類別物件的效果
pd.get_dummies(cat)

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


### 套用函數來轉換資料

In [172]:
# 在 mod02 已經有學過，可以複習一下
s = pd.Series(np.arange(0, 5))
s

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [173]:
# df.apply() 方法作用在序列，series.apply() 方法作用在值上
# 這裡用 map() 方法效果會一樣
s.apply(lambda x: x*2)

0    0
1    2
2    4
3    6
4    8
dtype: int64

---

In [174]:
df = pd.DataFrame(np.arange(12).reshape(4, 3),
                 columns = ['a', 'b', 'c'])
df

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [175]:
df.apply(lambda x: x.sum()) # 預設 axis = 0

a    18
b    22
c    26
dtype: int64

In [176]:
df.apply(lambda x: x.sum(), axis = 1)

0     3
1    12
2    21
3    30
dtype: int64

In [177]:
# 注意這裡的軸容易被誤導
# axis = 0 or ‘index’: apply function to each column.
# axis = 1 or ‘columns’: apply function to each row.
df['interim'] = df.apply(lambda x: x['a']*x['b'], axis = 1)
df

Unnamed: 0,a,b,c,interim
0,0,1,2,0
1,3,4,5,12
2,6,7,8,42
3,9,10,11,90


In [178]:
df['result'] = df.apply(lambda x: x['c'] + x['interim'], axis = 1)
df

Unnamed: 0,a,b,c,interim,result
0,0,1,2,0,2
1,3,4,5,12,17
2,6,7,8,42,50
3,9,10,11,90,101


---

In [179]:
df = pd.DataFrame(np.arange(0, 15).reshape(3, 5))
df.loc[1, 2] = np.nan
df

Unnamed: 0,0,1,2,3,4
0,0,1,2.0,3,4
1,5,6,,8,9
2,10,11,12.0,13,14


In [180]:
df.dropna().apply(lambda x: x.sum(), axis = 1)

0    10.0
2    60.0
dtype: float64

---

In [181]:
df

Unnamed: 0,0,1,2,3,4
0,0,1,2.0,3,4
1,5,6,,8,9
2,10,11,12.0,13,14


In [182]:
df.applymap(lambda x: '%.2f' % x)

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0


In [183]:
# 跟上面一樣是字串格式化，也可以用 f-string
df.applymap(lambda x: '{:.2f}'.format(x))

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0


## 字串操作

### 字串物件的方法

In [184]:
val = 'a, b, guido'
val.split(',')

['a', ' b', ' guido']

In [185]:
pieces = [i.strip(' ') for i in val.split(',')]
pieces

['a', 'b', 'guido']

In [186]:
# 可能會想把他們用符號 :: 連起來
first, second, third = pieces
first + '::' + second + '::' + third

'a::b::guido'

In [187]:
'::'.join(pieces) # 實務上比較常用的做法

'a::b::guido'

---

In [188]:
# 成員運算子
'guido' in val

True

In [189]:
# 字串的 .index 方法
val.index(',')

1

In [190]:
val.find(':') # -1 means the substring was not found

-1

In [191]:
val.index(':') # .find() 方法跟 .index() 方法的差別

ValueError: substring not found

In [192]:
val.count(',')

2

In [193]:
val.replace(',', '::')

'a:: b:: guido'

In [194]:
val.replace(',', '')

'a b guido'

### 正規表達式

In [195]:
text = "foo    bar\t baz  \tqux"
text

'foo    bar\t baz  \tqux'

In [196]:
# 滿足正規表達式的切割符號，會返回列表
re.split(r'\s+', text)

['foo', 'bar', 'baz', 'qux']

---

In [197]:
regex = re.compile(r'\s+') # 手動呼叫 re.compile() 來編譯一個 regex，這樣可以有重複可用的 regex 物件

In [198]:
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [199]:
regex.findall(text)

['    ', '\t ', '  \t']

---

In [200]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

In [201]:
# 有加 flags = re.IGNORECASE，我們就可以不用再加入小寫
# 另一種寫法: re.findall(pattern, text, flags=re.IGNORECASE)
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
regex = re.compile(pattern, flags = re.IGNORECASE)
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [202]:
m = regex.search(text)
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [203]:
# regex.match() only matches at the very beginning of the string. text starts
# with "Dave ", which is not an e-mail address, so the result is None and the
# cell displays nothing.
m = regex.match(text)
m

In [204]:
# redacted 的意思是為了模糊或刪除敏感信息而做出特殊編輯的
print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [205]:
text

'Dave dave@google.com\nSteve steve@gmail.com\nRob rob@gmail.com\nRyan ryan@yahoo.com\n'

In [206]:
# 不要忘記 flags= re.IGNORECASE，不然就一開始就把大小寫都加進 pattern 中
print(re.sub(pattern, 'REDACTED', text, flags= re.IGNORECASE))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



---

In [207]:
# 用括號 () 來分組
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)

In [208]:
m = regex.match('wesm@bright.net')
m.groups()

('wesm', 'bright', 'net')

In [209]:
# group() 與 group(0) 傳回完整的搜尋字串
m.group(0)

'wesm@bright.net'

In [210]:
m.group(1)

'wesm'

In [211]:
m.group(2)

'bright'

In [212]:
m.group(3)

'net'

In [213]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [214]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text)) # \1、\2、\3 這樣特別的符號是來存取分段的結果

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



### pandas 中向量字串函式

In [215]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [216]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [217]:
# 遇到遺失值，直接用 .map() 或 .apply() 方法等都會產生例外喔!
# 所以要加個條件式過濾
# 不能寫 x == np.nan ，過濾遺失值會失敗
def func(x):
    """Report whether *x* contains the substring 'gmail'.

    Missing values are passed through unchanged so that they stay missing in
    the mapped result; any other value is expected to be a string.

    Returns:
        True or False for a string input, or *x* itself when it is missing.
    """
    # pd.isna() recognises None and every NaN object. An identity test such
    # as `x is np.nan` only matches the np.nan singleton, so a NaN created
    # elsewhere (e.g. float('nan')) would reach re.search and raise TypeError.
    if pd.isna(x):
        return x
    return re.search(r'gmail', x) is not None

data.map(func)

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [218]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [219]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
data.str.findall(pattern, flags = re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [220]:
# 利用可以分組的正規表達式，從字串 Series 中取出一個或多個字串，輸出結果是一個資料框，每個分組會有一欄
# 一定要分組，也就是給括號
matches = data.str.extract(pattern, flags = re.IGNORECASE)
matches

Unnamed: 0,0,1,2
Dave,dave,google,com
Steve,steve,gmail,com
Rob,rob,gmail,com
Wes,,,


In [221]:
matches[0]

Dave      dave
Steve    steve
Rob        rob
Wes        NaN
Name: 0, dtype: object

In [222]:
matches.iloc[:, 1]

Dave     google
Steve     gmail
Rob       gmail
Wes         NaN
Name: 1, dtype: object

---

In [223]:
# 拿到字串物件後會對裡面每個字串做切片
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object

---

In [224]:
s = pd.Series(['**Crystal;L123  ', ' Matt;L456 ', 'Alice;L789 '])
s

0    **Crystal;L123  
1          Matt;L456 
2         Alice;L789 
dtype: object

In [225]:
s = s.str.strip('*').str.strip(' ')
s

0    Crystal;L123
1       Matt;L456
2      Alice;L789
dtype: object

In [226]:
s = s.str.split(';')
s

0    [Crystal, L123]
1       [Matt, L456]
2      [Alice, L789]
dtype: object

In [227]:
df = pd.DataFrame(np.zeros((len(s), 2)))
df

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0


In [228]:
df.iloc[0] = s[0]
df

Unnamed: 0,0,1
0,Crystal,L123
1,0.0,0.0
2,0.0,0.0


In [229]:
# Fill the frame one row at a time from the split lists.
# enumerate() yields the row position for .iloc together with the value, so
# the Series is never looked up by label with a positional counter; s[i]
# only works while s carries the default 0..n-1 index.
for i, parts in enumerate(s):
    df.iloc[i] = parts
df

Unnamed: 0,0,1
0,Crystal,L123
1,Matt,L456
2,Alice,L789


---

In [230]:
s = pd.Series(['**Crystal;L123  ', ' Matt;L456 ', 'Alice;L789 '])
s

0    **Crystal;L123  
1          Matt;L456 
2         Alice;L789 
dtype: object

In [231]:
pattern = r"([A-Za-z]+);([A-Z0-9]+)"
s.str.extract(pattern)

Unnamed: 0,0,1
0,Crystal,L123
1,Matt,L456
2,Alice,L789


## 綜合應用

In [232]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [233]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [234]:
string_data[0] = None

In [235]:
string_data

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [236]:
string_data.isnull() # None 會被當成 NaN

0     True
1    False
2     True
3    False
dtype: bool