#  <font color=red> Module_06_資料整理與前處理</font>

## 如何處理資料遺漏

### 先建立一個有遺漏值的資料框

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import re

df = pd.DataFrame(np.arange(0, 15).reshape(5, 3),
                 index = ['a', 'b', 'c', 'd', 'e'],
                 columns = ['c1', 'c2', 'c3'])
df

In [None]:
# pandas 使用了浮點數值 NaN (Not a Number) 來代表遺失資料，我們稱這個易於辨識的值為標記值
df['c4'] = np.nan

In [None]:
df.loc['f'] = np.arange(15, 19)

In [None]:
df

In [None]:
df.loc['a']['c4']

In [None]:
type(df.loc['a']['c4'])

---

In [None]:
df.loc['g'] = np.nan
df['c5'] = np.nan

In [None]:
df

In [None]:
# Write the cell through a single .loc call. The chained form df['c4']['a'] = 20
# assigns into an intermediate object and is not guaranteed to update df itself.
df.loc['a', 'c4'] = 20

In [None]:
df

### 判定 pandas 物件裡的 NaN 值

In [None]:
df

In [None]:
# 返回還是資料框
df.isnull()

In [None]:
# 注意在 python 中，NaN代表遺失值，它是 float 資料型態，重點是 np.nan != np.nan
# 但是 None == None 會是 True
# 下面的方法會得到不是我們要的結果
df == np.nan 

In [None]:
# 每個欄位有幾個遺失值
df.isnull().sum()

In [None]:
df.isnull().sum(axis = 1)

In [None]:
# 資料框內總共有幾個遺失值
df.isnull().sum().sum() 

---

In [None]:
# 資料框的 .count() 方法是計算非 NaN 的個數，預設 axis = 0
df.count() 

In [None]:
# 也可以得到資料框內總共有幾個遺失值
(len(df) - df.count()).sum() 

---

In [None]:
# 反過來問，還是返回資料框
df.notnull() 

In [None]:
df.notnull().sum().sum()

In [None]:
df.size

### 排除遺漏的資料項

In [None]:
df

In [None]:
# 看看 c4 行的非 NaN 的值
# 布林選擇
df.c4[df.c4.notnull()] 

In [None]:
# 使用序列的 .dropna() 得到同樣的效果
 # not in-place
df.c4.dropna()

In [None]:
df.c4

In [None]:
# 資料框的 .dropna() 方法
# 預設 axis = 0 and how = 'any'，也就是只要有出現 NaN 就把那一列刪了 # not in-place
df.dropna() 

In [None]:
# 要整列都是 NaN 才會刪掉
df.dropna(how = 'all')

In [None]:
# 先把整列都是 NaN 的列刪了，再把欄只要有  NaN 的也都刪了!
df.dropna(how = 'all').dropna(axis = 1, how = 'any')

In [None]:
df

In [None]:
df.dropna(axis = 1, how = 'all')

---

In [None]:
df2 = df.copy()
df2

In [None]:
# Set both cells of row 'g' with one .loc call so the write lands on df2 itself
# rather than on the temporary row object returned by df2.loc['g'].
df2.loc['g', ['c1', 'c3']] = 0

In [None]:
df2

In [None]:
df2.dropna(axis = 1, how = 'any', inplace = True)

In [None]:
df2

---

In [None]:
df

In [None]:
# thresh 參數就是 threshold ，臨界點的意思
# 該欄至少要有三個非 NaN 的值才不會被捨棄
df.dropna(thresh = 3, axis = 1)

### 在數學運算中處理 NaN 值

In [None]:
a = np.array([1, 2, np.nan, 3])
b = pd.Series(a)
a

In [None]:
b

In [None]:
a.mean() # Numpy 函式與方法遇到 NaN 通常會回傳 NaN

In [None]:
b.mean() # pandas 函式與方法通常會忽略 NaN # 所以答案是 (1+2+3)/3 = 2

---

In [None]:
s = df.c4
s

In [None]:
s.sum()

In [None]:
s.mean()

In [None]:
# 就算是累積總和的方法 .cumsum() 也是會忽略遺失值
s.cumsum()

In [None]:
df.c4 + 1 # 但如果是傳統的數學運算子，NaN 會傳遞到最終的結果

### 填入遺漏的資料

In [None]:
df

In [None]:
# 只要是遺失值的都填入 0
# not in-place
filled = df.fillna(0) 
filled

In [None]:
df

In [None]:
df.mean() 

In [None]:
# 注意!! 填完遺失值的數學統計量跟未填入算出來會產生不同的結果
# 遺失值如果沒有被填入，在操作 pandas 的方法時通常會被忽略
filled.mean()

---

In [None]:
df2 = df.copy()
df2

In [None]:
# 可以給個字典，給出哪個欄位的遺失值要填入哪個值
# not in-place
df2.fillna({'c2': 0, 'c4': 100})

In [None]:
df2

In [None]:
df2.fillna({'c2': 0, 'c4': 100}, inplace = True)
df2

### 以向前及向後方式填充遺漏值

In [None]:
df.c4

In [None]:
# Forward fill: each NaN takes the last valid value above it (common for time series).
# .ffill() replaces the deprecated fillna(method = 'ffill') spelling.
df.c4.ffill()

In [None]:
# 序列與資料框本身就有 .ffill() 與 .bfill() 方法可以直接使用
# Backward fill: each NaN takes the next valid value below it.
# .bfill() replaces the deprecated fillna(method = 'bfill') spelling.
df.c4.bfill()

---

In [None]:
df

In [None]:
# A DataFrame forward-fills the same way, working down each column.
# .ffill() replaces the deprecated fillna(method = 'ffill') spelling.
df.ffill()

In [None]:
# limit caps how many consecutive NaNs get filled; axis = 1 fills along each row.
# .ffill() replaces the deprecated fillna(method = 'ffill') spelling.
df.ffill(axis = 1, limit = 1)

### 利用索引標籤填值

In [None]:
df

In [None]:
fill_valued = pd.Series([100, 101, 102], index = ['a', 'e', 'g'])
fill_valued

In [None]:
df.c4

In [None]:
# 只有 NaN 的項目才能被擴充取代，注意標籤 a 的值沒變
# 裡面的 fill_valued 也可以是字典
df.c4.fillna(fill_valued) 

---

In [None]:
df

In [None]:
# 非常常見又方便的作法，每個欄位的遺失值都用那個欄位的平均值取代
# 如果是資料框，序列對應都是在看欄位
df.fillna(df.mean())

### 內插求出遺漏值

In [None]:
s = pd.Series([1, np.nan, np.nan, np.nan, 2])
s

In [None]:
s.interpolate

In [None]:
# 預設是以位置做內插
# 如果你的資料代表漸增的值，例如溫度，這方法比補 0 好多了
# 位置從 0 到 4 走了 (4 - 0) = 4 個單位
# 值從 1 變化到 2
# 1 單位變化了 (2-1)/(4-0) = 0.25 個值
s.interpolate()  

---

In [None]:
ts = pd.Series([1, np.nan, 2],
              index = [datetime(2014, 1, 1), datetime(2014, 2, 1), datetime(2014, 4, 1)])
ts

In [None]:
# 一單位變化了 (2 - 1)/(2 - 0) = 0.5 個值
# 預設是看位置
# 但如果考慮到日期索引，有更好的方式
ts.interpolate() 

In [None]:
# 方法改成看時間
# 所以一天變化了 (2 - 1)/(datetime(2014, 4, 1) - datetime(2014, 1, 1)) 個值
# 再看 1/1 到 2/1 過了幾天!
ts.interpolate(method = 'time') 

---

In [None]:
s = pd.Series([0, np.nan, 100], index = [0, 1, 10])
s

In [None]:
# 預設是以位置做內插
# 一單位變化了 (100 - 0)/(2 - 0) = 50 個值
s.interpolate() 

In [None]:
# 按照索引值做內插 
# 所以一單位變化了 (100 - 0)/(10 - 0) = 10 個值
s.interpolate(method = 'values') 

## 處理重複資料

In [None]:
data = pd.DataFrame({'a': ['x']*3 + ['y']*4,
                     'b': [1, 1, 2, 3, 3, 4, 4]})
data

In [None]:
data.duplicated() # 由上往下看是否有重複列

In [None]:
# 有 in-place 參數可供使用
# 預設 keep = 'first'
# 也可以用 data[~data.duplicated()]
data.drop_duplicates()  

In [None]:
data.drop_duplicates(keep = 'last')

---

In [None]:
data['c'] = range(7)
data

In [None]:
data.duplicated()

In [None]:
data.drop_duplicates(['a', 'b']) # 只看 a 與 b 行來決定

## 資料轉換

### 將資料映射成不同的值

In [None]:
x = pd.Series({'one': 1, 'two': 2, 'three': 3})
x

In [None]:
y = pd.Series({1: 'a', 2: 'b', 3: 'c'})
y

In [None]:
# 順便想想 python 是如何操作 map() 函式
# 在 mod02 我們有討論過 # series.map() 作用在值上，沒有資料框方法
# 裡面放函數，然後作用在每個值上 # 如果放序列也是類似的
x.map(y)

---

In [None]:
x = pd.Series({'one': 1, 'two': 2, 'three':3}) 
y = pd.Series({1:'a', 2:'b'})
x

In [None]:
y

In [None]:
x.map(y) # 沒有對應關係會出現 NaN

---

In [None]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

In [None]:
meat_to_animal = {
  'bacon': 'pig',
  'pulled pork': 'pig',
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}
meat_to_animal

In [None]:
# 利用 .str 屬性會拿到字串物件，再利用字串的 lower() 方法
# 會一一對每個值操作 .lower() 方法
lowercased = data['food'].str.lower()
lowercased

In [None]:
data['animal'] = lowercased.map(meat_to_animal)
data

In [None]:
# 另一種寫法得到同樣結果 
# 也可以用 .apply() 方法
# df.apply() 方法作用在序列，series.apply() 方法作用在值上
data['food'].map(lambda x: meat_to_animal[x.lower()])

### 值的取代

In [None]:
s = pd.Series([0, 1, 2, 3, 2, 4], index = np.arange(2, 8))
s

In [None]:
s.replace(2, 5) # 值是 2 的換成 5 # not in-place

In [None]:
s

---

In [None]:
s.replace([2, 3], np.nan) # 值是 2 或 3 的都換成遺失值

In [None]:
s.replace([3, 4, 0], [7, 8, 1]) # 值是 3 的換成 7，值是 4 的換成 8，值是 0 的換成 1

In [None]:
# 也可用字典表達
# 值 0 換成 100，值 3 換成 300
s.replace({0: 100, 3: 300}) 

---

In [None]:
# Work on a float copy of s: NaN is a float, so storing it in an integer Series
# would rely on implicit upcasting, which newer pandas versions warn about.
ss = s.astype(float)
ss[3] = np.nan
ss

In [None]:
ss.replace(np.nan, 0, inplace = True)

In [None]:
ss

---

In [None]:
df = pd.DataFrame({'a': [0, 1, 2, 3, 4], 
                   'b': [5, 6, 7, 1, 9]})
df

In [None]:
# 資料框的 .replace() 方法，把值 1 的都換 10
df.replace(1, 10)

In [None]:
# 資料框的 .replace() 方法，把欄位 a 值是 2 跟欄位 b 值是 9 的都換 10
df.replace({'a': 2, 'b': 9}, 10)

---

In [None]:
s

In [None]:
# Replace the entries equal to 0 or 4 with the previous kept value (forward fill).
# Series.replace(..., method = 'ffill') is deprecated, so the entries are masked to
# NaN and forward-filled instead. An entry with nothing before it keeps its
# original value, and the original dtype is restored at the end.
s.mask(s.isin([0, 4])).ffill().fillna(s).astype(s.dtype)

In [None]:
# Replace the entries equal to 0 or 4 with the next kept value (backward fill).
# Series.replace(..., method = 'bfill') is deprecated, so the entries are masked to
# NaN and backward-filled instead. An entry with nothing after it keeps its
# original value, and the original dtype is restored at the end.
s.mask(s.isin([0, 4])).bfill().fillna(s).astype(s.dtype)

### 更名軸 index

In [None]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index = ['Ohio', 'Colorado', 'New York'],
                    columns = ['one', 'two', 'three', 'four'])
data2 = data.copy()
data2

In [None]:
data2.columns = ['col1', 'col2', 'col3', 'col4']
data2.index = ['a', 'b', 'c']
data2

---

In [None]:
data

In [None]:
transform = lambda x: x[:4].upper()

In [None]:
data.index.map(transform)

In [None]:
data.index = data.index.map(transform)

In [None]:
data

---

In [None]:
data.rename(index = str.title, columns = str.upper) # index 與 columns 參數也可以接函式或方法

In [None]:
# 用 .rename() 方法可以只單獨改某幾個索引標籤
data.rename(index = {'OHIO': 'INDIANA'},
            columns = {'three': 'peekaboo'}) # 參數可以接字典

In [None]:
data

In [None]:
data.rename(index = {'OHIO': 'INDIANA'},
            columns = {'three': 'peekaboo'}, inplace = True)
data

### 離散化和分組

In [None]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
ages

In [None]:
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats

In [None]:
cats.categories

In [None]:
cats.codes

In [None]:
cats.value_counts()

In [None]:
cats.describe() 

---

In [None]:
# Twelve sample ages, stored in a one-column frame whose column is named 'age'.
ages = [
    20, 22, 25, 27, 21, 23,
    37, 31, 61, 45, 41, 32,
]
df = pd.DataFrame({'age': ages})
df

In [None]:
bins = [18, 25, 35, 60, 100]
labels = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(df['age'], bins = bins, labels = labels )

In [None]:
df['age_cat'] = pd.cut(df['age'], bins = bins, labels = labels )
df

In [None]:
df['age_cat'].describe()

In [None]:
df['age_cat'].value_counts()  # 更多內容請參考類別資料的章節

In [None]:
df.sort_values(by = "age_cat")

### 偵測和濾除離群值

In [None]:
np.random.seed(123456)
data = pd.DataFrame(np.random.randn(1000, 4)) # 母體平均值是 0，標準差是 1
data[:10]

In [None]:
data.describe()

In [None]:
data_cp = data.copy()
data_cp

In [None]:
# 找出欄位 2 (標籤為 2 的欄位) 中，減去平均值後的絕對值大於 3 個標準差的值，這些就是離群值
mask = np.abs(data_cp[2] - data_cp[2].mean()) > 3*data_cp[2].std() 
data_cp[2][mask]  

In [None]:
# 一種作法去除離群值
data_cp[~mask]

---

In [None]:
data_cp

In [None]:
# Drop every row in which any column lies more than 3 standard deviations from
# that column's mean. All thresholds are computed from the same, unfiltered data,
# so the result does not depend on the order in which the columns are visited
# (filtering column by column recomputes mean/std on already-trimmed data).
is_outlier = np.abs(data_cp - data_cp.mean()) > 3 * data_cp.std()
data_cp = data_cp[~is_outlier.any(axis = 1)]

In [None]:
data_cp

In [None]:
# 我們把有包含超過 3 個標準差的值的列都刪掉了
# 濾除離群值，通常利於之後的分析
data_cp.describe()

In [None]:
# 跟沒有刪除離群值的統計量比較一下
# 刪除離群值的平均值跟標準差估的比較準一點
data.describe()

### 排列與隨機取樣

In [None]:
df = pd.DataFrame(np.arange(20).reshape(5, 4), index = list('abced'))
df

In [None]:
# 把 0, 1, 2, 3, 4 隨機排列，也就是洗牌的意思
samper = np.random.permutation(len(df))
samper

In [None]:
df.iloc[samper]

In [None]:
# 跟上面一樣結果
# 回憶一下之前學過的 series.take()
# Return the elements in the given *positional* indices along an axis.
df.take(samper)

---

In [None]:
# 隨機選出三列，預設 replace = False，也就是取後不放回
df.sample(n = 3)

In [None]:
# 預設 replace = False，也就是取後不放回
# 就跟剛剛用 permutation 的效果是一樣
df.sample(n = len(df))

In [None]:
# 取後放回
# 可參考我們 mod03，之前有稍微提過 .sample() 方法，複習一下
df.sample(n = 10, replace = True)

### 指標 (indicator) 與虛擬變數 (dummy)

In [None]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
df

In [None]:
df['key']

In [None]:
dummies = pd.get_dummies(df['key'], prefix = 'key') # 在欄位加上前綴
dummies

In [None]:
# 注意 df[['data1']] 的寫法，出來會是一個資料框
# 因為只有資料框才有 .join() 方法，不然用 pd.concat() 函式也可以
# 資料框的 .join() 方法之後有章節會教到
df_with_dummy = df[['data1']].join(dummies) 
df_with_dummy

In [None]:
pd.concat([df[['data1']], dummies], axis = 1)

---

In [None]:
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('./mod06/movies.dat', sep='::',
                       header = None, names = mnames, encoding = "ISO-8859-1", engine = 'python') # 注意編碼跟引擎
movies

In [None]:
# 也可以用 movies.genres.str.split("|")，再搭配迴圈
all_genres = []
for i in movies.genres:
    all_genres.extend(i.split('|'))
all_genres

In [None]:
# Flatten every movie's '|'-separated genre string into one long list of genre names.
all_genres = [name for entry in movies.genres for name in entry.split('|')]
all_genres

In [None]:
# Collect the distinct genres in order of first appearance.
# The list is wrapped in a Series first because passing a plain Python list to
# pd.unique() is deprecated.
genres = pd.Series(all_genres).unique()
genres

In [None]:
# 準備來建立虛擬變數
zero_matrix = np.zeros((len(movies), len(genres)))
dummies = pd.DataFrame(zero_matrix, columns = genres)
dummies

In [None]:
gen = movies.genres[0].split('|')
gen

In [None]:
# index 物件有 .get_indexer() 方法，給索引標籤會返回位置
dummies.columns.get_indexer(gen)

In [None]:
# Mark each movie's genres with 1. get_indexer() turns the genre labels into column
# positions, and a single .iloc[row, columns] call writes into dummies itself;
# the chained form dummies.iloc[i][indices] = 1 assigns to a temporary row instead.
# (The labels could also be used directly through .loc, skipping the positions.)
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1

In [None]:
dummies

In [None]:
# Equivalent alternative: pd.concat([movies, dummies.add_prefix('Genre')], axis = 1)
movies_windic = movies.join(dummies.add_prefix('Genre')) # DataFrame.add_prefix() prepends the given string to every column name
movies_windic

In [None]:
movies_windic.iloc[0]

---

In [None]:
np.random.seed(12345)
values = np.random.rand(10)
values

In [None]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

In [None]:
# 這是類別物件
cat = pd.cut(values, bins = bins)
cat

In [None]:
cat.value_counts()

In [None]:
cat.describe()

In [None]:
# pd.get_dummies 函式放入類別物件的效果
pd.get_dummies(cat)

### 套用函數來轉換資料

In [None]:
# 在 mod02 已經有學過，可以複習一下
s = pd.Series(np.arange(0, 5))
s

In [None]:
# df.apply() 方法作用在序列，series.apply() 方法作用在值上
# 這裡用 map() 方法效果會一樣
s.apply(lambda x: x*2)

---

In [None]:
df = pd.DataFrame(np.arange(12).reshape(4, 3),
                 columns = ['a', 'b', 'c'])
df

In [None]:
df.apply(lambda x: x.sum()) # 預設 axis = 0

In [None]:
df.apply(lambda x: x.sum(), axis = 1)

In [None]:
# 注意這裡的軸容易被誤導
# axis = 0 or ‘index’: apply function to each column.
# axis = 1 or ‘columns’: apply function to each row.
df['interim'] = df.apply(lambda x: x['a']*x['b'], axis = 1)
df

In [None]:
df['result'] = df.apply(lambda x: x['c'] + x['interim'], axis = 1)
df

---

In [None]:
# Build a 3x5 integer frame, then blank out one cell.
# Column 2 is cast to float first: NaN is a float, and writing it into an integer
# column relies on implicit upcasting, which newer pandas versions warn about.
df = pd.DataFrame(np.arange(0, 15).reshape(3, 5))
df[2] = df[2].astype(float)
df.loc[1, 2] = np.nan
df

In [None]:
df.dropna().apply(lambda x: x.sum(), axis = 1)

---

In [None]:
df

In [None]:
df.applymap(lambda x: '%.2f' % x)

In [None]:
# 跟上面一樣是字串格式化，也可以用 f-string
df.applymap(lambda x: '{:.2f}'.format(x))

## 字串操作

### 字串物件的方法

In [None]:
val = 'a, b, guido'
val.split(',')

In [None]:
pieces = [i.strip(' ') for i in val.split(',')]
pieces

In [None]:
# 可能會想把他們用符號 :: 連起來
first, second, third = pieces
first + '::' + second + '::' + third

In [None]:
'::'.join(pieces) # 實務上比較常用的做法

---

In [None]:
# 成員運算子
'guido' in val

In [None]:
# 字串的 .index 方法
val.index(',')

In [None]:
val.find(':') # -1 means the substring was not found

In [None]:
val.index(':') # .find() 方法跟 .index() 方法的差別

In [None]:
val.count(',')

In [None]:
val.replace(',', '::')

In [None]:
val.replace(',', '')

### 正規表達式

In [None]:
text = "foo    bar\t baz  \tqux"
text

In [None]:
# 滿足正規表達式的切割符號，會返回列表
re.split(r'\s+', text)

---

In [None]:
regex = re.compile(r'\s+') # 手動呼叫 re.compile() 來編譯一個 regex，這樣可以有重複可用的 regex 物件

In [None]:
regex.split(text)

In [None]:
regex.findall(text)

---

In [None]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

In [None]:
# 有加 flags = re.IGNORECASE，我們就可以不用再加入小寫
# 另一種寫法: re.findall(pattern, text, flags=re.IGNORECASE)
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
regex = re.compile(pattern, flags = re.IGNORECASE)
regex.findall(text)

In [None]:
m = regex.search(text)
m

In [None]:
m = regex.match(text)
m

In [None]:
# redacted 的意思是為了模糊或刪除敏感信息而做出特殊編輯的
print(regex.sub('REDACTED', text))

In [None]:
text

In [None]:
# 不要忘記 flags= re.IGNORECASE，不然就一開始就把大小寫都加進 pattern 中
print(re.sub(pattern, 'REDACTED', text, flags= re.IGNORECASE))

---

In [None]:
# 用括號 () 來分組
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)

In [None]:
m = regex.match('wesm@bright.net')
m.groups()

In [None]:
# group() 與 group(0) 傳回完整的搜尋字串
m.group(0)

In [None]:
m.group(1)

In [None]:
m.group(2)

In [None]:
m.group(3)

In [None]:
regex.findall(text)

In [None]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text)) # \1、\2、\3 這樣特別的符號是來存取分段的結果

### pandas 中向量字串函式

In [None]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data

In [None]:
data.isnull()

In [None]:
# 遇到遺失值，直接用 .map() 或 .apply() 方法等都會產生例外喔!
# 所以要加個條件式過濾
# 不能寫 x == np.nan ，過濾遺失值會失敗
def func(x):
    """Return True/False for whether the string x contains 'gmail'.

    Missing values are returned unchanged so they stay missing in the result.
    pd.isna() is used for the check because it recognises any NaN float as well
    as None, whereas an identity test against np.nan only matches that one
    object and lets other missing values fall through to re.search(), which
    raises TypeError on non-string input.
    """
    if pd.isna(x):
        return x
    return re.search(r'gmail', x) is not None

data.map(func)

In [None]:
data.str.contains('gmail')

In [None]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
data.str.findall(pattern, flags = re.IGNORECASE)

In [None]:
# 利用可以分組的正規表達式，從字串 Series 中取出一個或多個字串，輸出結果是一個資料框，每個分組會有一欄
# 一定要分組，也就是給括號
matches = data.str.extract(pattern, flags = re.IGNORECASE)
matches

In [None]:
matches[0]

In [None]:
matches.iloc[:, 1]

---

In [None]:
# 拿到字串物件後會對裡面每個字串做切片
data.str[:5]

---

In [None]:
s = pd.Series(['**Crystal;L123  ', ' Matt;L456 ', 'Alice;L789 '])
s

In [None]:
s = s.str.strip('*').str.strip(' ')
s

In [None]:
s = s.str.split(';')
s

In [None]:
# Pre-allocate a len(s) x 2 frame that will receive the split string pieces.
# dtype = object lets the cells hold strings; writing strings into float64 columns
# relies on implicit upcasting, which newer pandas versions warn about.
df = pd.DataFrame(np.zeros((len(s), 2)), dtype = object)
df

In [None]:
df.iloc[0] = s[0]
df

In [None]:
for i in range(len(s)):
    df.iloc[i] = s[i]
df

---

In [None]:
s = pd.Series(['**Crystal;L123  ', ' Matt;L456 ', 'Alice;L789 '])
s

In [None]:
pattern = r"([A-Za-z]+);([A-Z0-9]+)"
s.str.extract(pattern)

## 綜合應用

In [None]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

In [None]:
string_data.isnull()

In [None]:
string_data[0] = None

In [None]:
string_data

In [None]:
string_data.isnull() # None 會被當成 NaN