#  <font color=red> Module_04_類別資料</font>

## 分類的背景、動機、好處

In [None]:
import pandas as pd
import numpy as np

# Eight fruit names stored as plain strings: a four-item pattern repeated twice.
values = pd.Series(data=['apple', 'orange', 'apple', 'apple'] * 2)
values

In [None]:
values.unique() # distinct values, in order of first appearance; pd.unique(values) is the function form

In [None]:
values.value_counts() # occurrences of each distinct value (the top-level pd.value_counts() function is deprecated since pandas 2.1; prefer this method)

---

In [None]:
# Eight observations encoded as small integer codes instead of strings.
values = pd.Series(data=[0, 1, 0, 0] * 2)
values

In [None]:
# Dimension table: position 0 holds 'apple' and position 1 holds 'orange',
# so it records which label each integer code stands for.
dim = pd.Series(data=['apple', 'orange'])
dim

In [None]:
# Restore the labels with .take(): every integer in `values` is used as a
# position into the dimension table `dim`.
dim.take(values)

---

In [None]:
# Ten million: element count used to build a large Series.
N = 10_000_000

In [None]:
# Build a long Series that cycles through four distinct string labels.
# The 4-item list is repeated N//4 times, so the length is N when N is a multiple of 4.
labels = pd.Series(['foo', 'bar', 'baz', 'qux']*(N//4))
labels

In [None]:
# Convert to categorical dtype: each distinct label is stored once and the
# rows hold integer codes that refer to it.
categories = labels.astype('category')
categories

In [None]:
# deep=True also measures the Python string objects themselves; without it an
# object-dtype Series only reports its array of pointers, which badly
# understates the real footprint of the string data.
labels.memory_usage(deep=True)

In [None]:
# The categorical Series uses far less memory. deep=True includes the few
# category strings themselves so the figure is a true total.
categories.memory_usage(deep=True)

In [None]:
%timeit _ = labels.astype('category') # the price of the conversion is time: measure how long it takes

## 建立類別物件

In [None]:
# A Categorical object represents a categorical variable.
# With no explicit categories they are inferred from the data.
lmh_values = [
    'low',
    'high',
    'medium',
    'medium',
    'high',
]
lmh_cat = pd.Categorical(values=lmh_values)
lmh_cat

In [None]:
# The object is a pandas Categorical.
type(lmh_cat)

In [None]:
lmh_cat.categories # the .categories attribute lists the distinct categories

In [None]:
# Integer code of each element (its position in .categories).
# The default category order may not be the order you want.
lmh_cat.codes

---

In [None]:
# Pass `categories` to impose the category order you want.
# The `ordered` argument can additionally declare that the categories are ranked.
lmh_cat = pd.Categorical(lmh_values, categories = ['low', 'medium', 'high'] ) 
lmh_cat

In [None]:
# Codes are positions in .categories, so they reflect the category order in use.
lmh_cat.codes

---

In [None]:
# A Categorical sorts by category code, i.e. by the order of .categories.
# Plain strings, by contrast, would be sorted lexicographically by .sort_values().
lmh_cat.sort_values()

In [None]:
# min() and max() are only available once the Categorical is ordered.
# as_ordered() returns a Categorical whose categories carry a ranking.
lmh_cat = lmh_cat.as_ordered() 
lmh_cat

In [None]:
# Lowest-ranked category present in the data.
lmh_cat.min()

In [None]:
# Highest-ranked category present in the data.
lmh_cat.max()

---

In [None]:
# The original list of plain strings, for comparison.
lmh_values

In [None]:
# The same data as an ordinary Series (no categorical dtype requested).
s = pd.Series(lmh_values)
s

In [None]:
# Stored as plain strings, .sort_values() orders lexicographically
# (high, low, medium) rather than by rank.
s.sort_values()

---

In [None]:
# Build a Series whose values are categorical.
# Instead of passing dtype, a Categorical object could be given as the data directly.
cat_series = pd.Series(lmh_values, dtype = 'category') 
cat_series

In [None]:
# The underlying values of the Series are a pandas Categorical.
cat_series.values

In [None]:
# One way to change the category order of a Series: cast it to a
# CategoricalDtype that lists the categories in the desired order and is ordered.
cut_dtype = pd.CategoricalDtype(['low', 'medium', 'high'], ordered = True)
cat_series = cat_series.astype(cut_dtype)
cat_series    

In [None]:
# This is the result we want: sorting follows the category order.
cat_series.sort_values()

---

In [None]:
# Go back to the original: a categorical Series with inferred categories.
cat_series = pd.Series(lmh_values, dtype = 'category')
cat_series

In [None]:
# Note this is still a Series; only the data inside it is categorical.
# Categorical methods and attributes are therefore not available on it directly;
# the solution is the Series' .cat accessor.
cat_series

In [None]:
cat_series.cat # accessor that exposes the attributes of the underlying Categorical

In [None]:
# Categories of the underlying Categorical, reached through .cat.
cat_series.cat.categories

In [None]:
# Integer codes of the underlying Categorical, returned as a Series.
cat_series.cat.codes

In [None]:
# Another way to change the category order of a Series:
# call .set_categories() through the .cat accessor.
cat_series = cat_series.cat.set_categories(['low', 'medium', 'high'])
cat_series

In [None]:
# Codes are positions in the current category list.
cat_series.cat.codes

In [None]:
# Returns an ordered version; the result is not assigned, so cat_series itself is unchanged.
cat_series.cat.as_ordered()

---

In [None]:
# Five reproducible random integers in [0, 100), held in a one-column frame.
np.random.seed(123456)
values = np.random.randint(low=0, high=100, size=5)
bins = pd.DataFrame(data={'Value': values})
bins

In [None]:
# Bucket each value into a width-10 interval between 0 and 100.
# NOTE: pd.cut intervals are right-closed by default, so a value of exactly 0
# would fall outside the first bin (0, 10] and become NaN.
bins['Group'] = pd.cut(bins['Value'].values, bins = range(0, 110, 10)) # pd.cut() returns a Categorical here
bins

In [None]:
# Note that the data is split into 10 buckets (11 bin edges give 10 intervals).
bins.Group

In [None]:
# Note that for categorical data, categories with 0 occurrences are listed too.
bins.Group.value_counts()

In [None]:
# Sort the rows by bucket, highest interval first.
bins.sort_values(by = 'Group', ascending = False)

---

In [None]:
# Build an ordered Categorical: bronze < silver < gold.
metal_values = ['bronze', 'gold', 'silver', 'bronze']
metal_categories = ['bronze', 'silver', 'gold']
metals = pd.Categorical(
    metal_values,
    dtype=pd.CategoricalDtype(categories=metal_categories, ordered=True),
)
metals

In [None]:
# Build a second Categorical holding the same values in reverse order,
# with the same ordered categories.
metals_reversed_values = pd.Categorical(metals[::-1],
                         categories = metal_categories,
                         ordered = True)
metals_reversed_values

In [None]:
# Being ordered lets two Categoricals be compared element by element.
# The comparison is made on the category codes.
metals < metals_reversed_values 

In [None]:
# Codes of the original Categorical, to check the comparison above by hand.
metals.codes

In [None]:
# Codes of the reversed Categorical.
metals_reversed_values.codes

---

In [None]:
# The list of valid category names.
metal_categories

In [None]:
pd.Categorical(['bronze', 'copper'], categories = metal_categories) # values not listed in categories become NaN, which filters out invalid values at construction time

---

In [None]:
# Category labels and, separately, integer codes that index into them.
# NOTE: this rebinds `categories`, which earlier held a categorical Series.
categories = ['foo', 'bar', 'baz']
codes = [0, 1, 2, 0, 0, 1]

In [None]:
# Build a Categorical directly from integer codes plus their category labels.
my_cat = pd.Categorical.from_codes(codes = codes, categories = categories)
my_cat

In [None]:
# If you forget from_codes(): map the codes back to labels with a list
# comprehension, then build the Categorical with pd.Categorical().
tmp = [categories[i] for i in codes]
tmp

In [None]:
# Build the Categorical from the restored labels, keeping the given category order.
pd.Categorical(tmp, categories = categories)

## 重新命名類別

In [None]:
# Unordered Categorical with the three categories listed explicitly.
cat = pd.Categorical(
    ['a', 'b', 'c', 'a'],
    dtype=pd.CategoricalDtype(categories=['a', 'b', 'c']),
)
cat

In [None]:
# Rename the categories. Assigning to `cat.categories` directly was
# deprecated in pandas 1.5 and removed in 2.0 (it raises AttributeError
# there), so rebind the name to the result of rename_categories() instead.
cat = cat.rename_categories(['bronze', 'silver', 'gold'])
cat

In [None]:
cat.rename_categories(['x', 'y', 'z']) # not in-place: returns a new Categorical

In [None]:
# cat itself is unchanged by the rename_categories() call, whose result was not assigned.
cat

## 附加新類別

In [None]:
# Build an ordered Categorical: bronze < silver < gold.
metal_values = ['bronze', 'gold', 'silver', 'bronze']
metal_categories = ['bronze', 'silver', 'gold']
metals = pd.Categorical(
    metal_values,
    dtype=pd.CategoricalDtype(categories=metal_categories, ordered=True),
)
metals

In [None]:
# add_categories() returns a new Categorical with 'platinum' added to the
# categories; no element uses the new category.
with_platinum = metals.add_categories(['platinum'])
with_platinum

## 移除類別

In [None]:
# Removing a category turns the elements that had that value into NaN.
no_bronze = metals.remove_categories(['bronze'])
no_bronze

## 移除未使用的類別

In [None]:
# 'platinum' is a category here although no element uses it.
with_platinum

In [None]:
# Drop the categories that no element uses.
with_platinum.remove_unused_categories()

## 設定類別

In [None]:
# Four distinct strings converted to a categorical Series.
s = pd.Series(['one', 'two', 'four', 'five']).astype('category')
s

In [None]:
# Keep only the listed categories; elements holding any other value become NaN.
s = s.cat.set_categories(['one', 'four'])
s

## 類別物件的敘述性資訊

In [None]:
# The ordered Categorical summarised in the following cells.
metals

In [None]:
metals.describe() # descriptive summary of the Categorical

In [None]:
# Categories with a count of 0 are listed as well.
metals.value_counts()

In [None]:
# Lowest-ranked category present (metals is ordered).
metals.min()

In [None]:
# Highest-ranked category present (metals is ordered).
metals.max()

In [None]:
# Most frequent value(s). Categorical.mode() was deprecated in favour of
# Series.mode(), so wrap the Categorical in a Series first; the result keeps
# the categorical dtype.
pd.Series(metals).mode()

## 學校成績轉換

In [None]:
# Ten students with reproducible random grades between 50 and 100 inclusive.
np.random.seed(123456)
names = [
    'Ivana', 'Norris', 'Ruth', 'Lane', 'Skye',
    'Sol', 'Dylan', 'Katina', 'Alissa', 'Marc',
]
grades = np.random.randint(low=50, high=101, size=len(names))
scores = pd.DataFrame(data={'Name': names, 'Grade': grades})
scores

In [None]:
# Bin edges for the numeric grades, and the letter grade that names each
# interval between two consecutive edges (13 labels for 14 edges).
score_bins = [
    0, 59, 62, 66, 69, 72, 76,
    79, 82, 86, 89, 92, 99, 100,
]
letter_grades = [
    'F',
    'D-', 'D', 'D+',
    'C-', 'C', 'C+',
    'B-', 'B', 'B+',
    'A-', 'A', 'A+',
]

In [None]:
# Map each numeric grade to a letter grade: consecutive bin edges form the
# intervals and `labels` names them in order. The result is stored as a new column.
letter_cats = pd.cut(scores.Grade, bins = score_bins, labels = letter_grades)
scores['Letter'] = letter_cats
scores

In [None]:
# The new column is categorical; its dtype lists every letter grade.
scores.Letter

In [None]:
# This is the Series .describe() method; its output differs slightly from
# that of the Categorical .describe() method.
scores.Letter.describe()

In [None]:
# Categories that never occur are listed too, with a count of 0.
scores.Letter.value_counts()

In [None]:
# Sort students by letter grade in descending category order.
scores.sort_values(by = ['Letter'], ascending = False )

## 綜合應用

In [None]:
# Small demo frame: one row per basket with a fruit name, a random count in
# [3, 15) and a random weight in [0, 4).
# NOTE: N is rebound here to the number of fruits (8).
fruits = ['apple', 'orange', 'apple', 'apple'] * 2
N = len(fruits)
df = pd.DataFrame(
    dict(
        fruit=fruits,
        basket_id=np.arange(N),
        count=np.random.randint(3, 15, size=N),
        weight=np.random.uniform(0, 4, size=N),
    ),
    # `columns` fixes the column order of the resulting frame.
    columns=['basket_id', 'fruit', 'count', 'weight'],
)
df

In [None]:
# Convert the fruit column to categorical dtype.
fruit_cat = df['fruit'].astype('category')
fruit_cat

In [None]:
c = fruit_cat.values # the values are not a NumPy array but a pandas.Categorical instance
type(c)

In [None]:
# The distinct categories of the Categorical.
c.categories

In [None]:
# Integer code of each element (its position in c.categories).
c.codes

In [None]:
# The return type of .take() depends on the object it is called on:
# here it is called on c.categories, with the codes as positions, to recover the labels.
c.categories.take(c.codes)

---

In [None]:
# 1000 reproducible draws from the standard normal distribution; show the first five.
np.random.seed(12345)
draws = np.random.standard_normal(size=1000)
draws[:5]

In [None]:
# Cut the draws into 4 quantile-based bins.
bins = pd.qcut(draws, q = 4)
bins

In [None]:
# Descriptive summary of the binned Categorical.
bins.describe()

In [None]:
# The same 4 quantile bins, named with readable labels.
bins = pd.qcut(draws, 4, labels = ['Q1', 'Q2', 'Q3', 'Q4'])
bins

In [None]:
# Descriptive summary of the labelled bins.
bins.describe()

In [None]:
# Integer codes of the first ten elements.
bins.codes[:10]

In [None]:
# Wrap the Categorical in a Series named 'quantile'.
bins = pd.Series(bins, name = 'quantile')
bins

In [None]:
# The raw draws as a Series.
pd.Series(draws)

In [None]:
# Per-quantile count / min / max of the draws. observed=False is passed
# explicitly: it keeps every category in the result (the long-standing default)
# and avoids the FutureWarning that pandas 2.1+ emits when grouping by a
# categorical key without stating it.
results = pd.Series(draws).groupby(bins, observed=False).agg(['count', 'min', 'max'])
results

In [None]:
# Move the group key out of the index into a regular column.
results = results.reset_index()
results

In [None]:
# The group-key column, named after the 'quantile' Series used for grouping.
results['quantile']

---

In [None]:
# Eight single-letter labels (a-d repeated twice) as a categorical Series.
s_cat = pd.Series(['a', 'b', 'c', 'd'] * 2).astype('category')
s_cat

In [None]:
# Integer code of each element.
s_cat.cat.codes

In [None]:
# Create dummy variables for modelling (one-hot encoding):
# one indicator column per category.
# Compare the result with s_cat.cat.codes.
pd.get_dummies(s_cat) 