#  <font color=red> Module_04_類別資料</font>

## 分類的背景、動機、好處

In [1]:
import pandas as pd
import numpy as np

# Eight fruit names stored as plain strings (object dtype); only two distinct values repeat.
values = pd.Series(['apple', 'orange', 'apple', 'apple']*2)
values

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object

In [2]:
values.unique() # distinct values; the function form pd.unique(values) works as well

array(['apple', 'orange'], dtype=object)

In [3]:
values.value_counts() # frequency of each distinct value; prefer this method, the top-level pd.value_counts() function is deprecated in recent pandas (2.1+)

apple     6
orange    2
dtype: int64

---

In [4]:
# The same eight observations, now encoded as integer codes instead of strings.
values = pd.Series([0, 1, 0, 0]*2)
values

0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64

In [5]:
# Dimension table:
# it tells us that code 0 corresponds to 'apple' and code 1 to 'orange'.
dim = pd.Series(['apple', 'orange'])
dim

0     apple
1    orange
dtype: object

In [6]:
# Decode with .take(): pick the label at each position listed in `values`, restoring the original strings.
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

---

In [7]:
N = 10000000 # number of elements used for the memory/time comparison

In [8]:
# N strings drawn from only four distinct values, kept as an object-dtype Series.
labels = pd.Series(['foo', 'bar', 'baz', 'qux']*(N//4))
labels

0          foo
1          bar
2          baz
3          qux
4          foo
          ... 
9999995    qux
9999996    foo
9999997    bar
9999998    baz
9999999    qux
Length: 10000000, dtype: object

In [9]:
# Convert to the category dtype; the four categories are inferred from the data.
categories = labels.astype('category')
categories

0          foo
1          bar
2          baz
3          qux
4          foo
          ... 
9999995    qux
9999996    foo
9999997    bar
9999998    baz
9999999    qux
Length: 10000000, dtype: category
Categories (4, object): ['bar', 'baz', 'foo', 'qux']

In [10]:
# NOTE(review): without deep=True this counts only the 8-byte object pointers (8 bytes x N plus the index),
# not the string objects themselves, so the real footprint of the object Series is understated.
labels.memory_usage()

80000128

In [11]:
categories.memory_usage() # the categorical Series consumes far less memory (about one byte per element here)

10000332

In [12]:
%timeit _ = labels.astype('category') # the price of converting to category is paid in time

336 ms ± 9.82 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## 建立類別物件

In [13]:
# A Categorical object represents a categorical variable.
lmh_values = ['low', 'high', 'medium', 'medium', 'high']
lmh_cat = pd.Categorical(lmh_values)
lmh_cat

['low', 'high', 'medium', 'medium', 'high']
Categories (3, object): ['high', 'low', 'medium']

In [14]:
# Confirm the Python type of the object: a pandas Categorical.
type(lmh_cat)

pandas.core.arrays.categorical.Categorical

In [15]:
lmh_cat.categories # the .categories attribute lists the available categories

Index(['high', 'low', 'medium'], dtype='object')

In [16]:
# The inferred category order may not be the one you want:
# the codes follow the alphabetical categories (high=0, low=1, medium=2).
lmh_cat.codes

array([1, 0, 2, 2, 0], dtype=int8)

---

In [17]:
# Use the `categories` parameter to fix the order you want.
# The `ordered` parameter can additionally declare that the categories are ranked.
lmh_cat = pd.Categorical(lmh_values, categories = ['low', 'medium', 'high'] )
lmh_cat

['low', 'high', 'medium', 'medium', 'high']
Categories (3, object): ['low', 'medium', 'high']

In [18]:
lmh_cat.codes # codes now follow the declared order: low=0, medium=1, high=2

array([0, 2, 1, 1, 2], dtype=int8)

---

In [19]:
# Categorical.sort_values() sorts by category code, i.e. by the declared category order.
# Stored as plain strings, the same values would be sorted lexicographically (alphabetically) instead.
lmh_cat.sort_values()

['low', 'medium', 'medium', 'high', 'high']
Categories (3, object): ['low', 'medium', 'high']

In [20]:
# The Categorical must be ordered before methods such as max() and min() can be used.
# After as_ordered() the categories carry a ranking (low < medium < high).
lmh_cat = lmh_cat.as_ordered()
lmh_cat

['low', 'high', 'medium', 'medium', 'high']
Categories (3, object): ['low' < 'medium' < 'high']

In [21]:
lmh_cat.min() # smallest category according to the declared order

'low'

In [22]:
lmh_cat.max() # largest category according to the declared order

'high'

---

In [23]:
lmh_values # the original plain Python list of strings

['low', 'high', 'medium', 'medium', 'high']

In [24]:
# For comparison: the same values as an ordinary string (object dtype) Series.
s = pd.Series(lmh_values)
s

0       low
1      high
2    medium
3    medium
4      high
dtype: object

In [25]:
# Stored as strings, .sort_values() sorts lexicographically (alphabetically): high, low, medium.
s.sort_values()

1      high
4      high
0       low
2    medium
3    medium
dtype: object

---

In [26]:
# Build a Series whose data is categorical, i.e. backed by the Categorical object introduced earlier.
# Instead of passing dtype, a Categorical object can also be passed directly as the data.
cat_series = pd.Series(lmh_values, dtype = 'category')
cat_series

0       low
1      high
2    medium
3    medium
4      high
dtype: category
Categories (3, object): ['high', 'low', 'medium']

In [27]:
cat_series.values # the underlying Categorical object

['low', 'high', 'medium', 'medium', 'high']
Categories (3, object): ['high', 'low', 'medium']

In [28]:
# One way to change the category order of a Series: cast it to an explicit, ordered CategoricalDtype.
cut_dtype = pd.CategoricalDtype(['low', 'medium', 'high'], ordered = True)
cat_series = cat_series.astype(cut_dtype)
cat_series

0       low
1      high
2    medium
3    medium
4      high
dtype: category
Categories (3, object): ['low' < 'medium' < 'high']

In [29]:
# This is the result we actually want: sorted low -> medium -> high.
cat_series.sort_values()

0       low
2    medium
3    medium
1      high
4      high
dtype: category
Categories (3, object): ['low' < 'medium' < 'high']

---

In [30]:
# Go back to the original Series with inferred (alphabetical, unordered) categories.
cat_series = pd.Series(lmh_values, dtype = 'category')
cat_series

0       low
1      high
2    medium
3    medium
4      high
dtype: category
Categories (3, object): ['high', 'low', 'medium']

In [31]:
# Note that this is still a Series; only the data inside it is categorical,
# so the methods and attributes of the Categorical object cannot be used on it directly.
# The solution is the Series' .cat accessor.
cat_series

0       low
1      high
2    medium
3    medium
4      high
dtype: category
Categories (3, object): ['high', 'low', 'medium']

In [32]:
cat_series.cat # accessor object that exposes the attributes of the underlying Categorical

<pandas.core.arrays.categorical.CategoricalAccessor object at 0x000002079415DE80>

In [33]:
cat_series.cat.categories # category labels, read through the accessor

Index(['high', 'low', 'medium'], dtype='object')

In [34]:
cat_series.cat.codes # integer codes, returned as a Series

0    1
1    0
2    2
3    2
4    0
dtype: int8

In [35]:
# Another way to change the category order of a Series:
# call .set_categories() through the .cat accessor.
cat_series = cat_series.cat.set_categories(['low', 'medium', 'high'])
cat_series

0       low
1      high
2    medium
3    medium
4      high
dtype: category
Categories (3, object): ['low', 'medium', 'high']

In [36]:
cat_series.cat.codes # codes now follow the new order: low=0, medium=1, high=2

0    0
1    2
2    1
3    1
4    2
dtype: int8

In [37]:
cat_series.cat.as_ordered() # displays an ordered version; the result is not assigned, so cat_series itself is unchanged

0       low
1      high
2    medium
3    medium
4      high
dtype: category
Categories (3, object): ['low' < 'medium' < 'high']

---

In [38]:
# Five reproducible random integers in [0, 100), stored in a one-column DataFrame.
np.random.seed(123456)
values = np.random.randint(0, 100, 5)
bins = pd.DataFrame({'Value': values})
bins

Unnamed: 0,Value
0,65
1,49
2,56
3,43
4,43


In [39]:
# pd.cut() returns a Categorical; here the bins are the right-closed intervals (0, 10], (10, 20], ..., (90, 100].
# NOTE(review): a value of exactly 0 would fall outside (0, 10] and become NaN — confirm whether include_lowest=True is wanted.
bins['Group'] = pd.cut(bins['Value'].values, bins = range(0, 110, 10))
bins

Unnamed: 0,Value,Group
0,65,"(60, 70]"
1,49,"(40, 50]"
2,56,"(50, 60]"
3,43,"(40, 50]"
4,43,"(40, 50]"


In [40]:
# Note that the column carries all 10 buckets as categories, although only three occur in the data.
bins.Group

0    (60, 70]
1    (40, 50]
2    (50, 60]
3    (40, 50]
4    (40, 50]
Name: Group, dtype: category
Categories (10, interval[int64, right]): [(0, 10] < (10, 20] < (20, 30] < (30, 40] ... (60, 70] < (70, 80] < (80, 90] < (90, 100]]

In [41]:
# Note: for categorical data, categories with zero occurrences are still listed.
bins.Group.value_counts()

(40, 50]     3
(50, 60]     1
(60, 70]     1
(0, 10]      0
(10, 20]     0
(20, 30]     0
(30, 40]     0
(70, 80]     0
(80, 90]     0
(90, 100]    0
Name: Group, dtype: int64

In [42]:
bins.sort_values(by = 'Group', ascending = False) # sort rows by interval order, highest bucket first

Unnamed: 0,Value,Group
0,65,"(60, 70]"
2,56,"(50, 60]"
1,49,"(40, 50]"
3,43,"(40, 50]"
4,43,"(40, 50]"


---

In [43]:
# Create an ordered Categorical: bronze < silver < gold.
metal_values = ['bronze', 'gold', 'silver', 'bronze']
metal_categories = ['bronze', 'silver', 'gold']
metals = pd.Categorical(metal_values, 
                        categories = metal_categories, 
                        ordered = True) 
metals

['bronze', 'gold', 'silver', 'bronze']
Categories (3, object): ['bronze' < 'silver' < 'gold']

In [44]:
# Build a second Categorical that holds the same values in reversed order.
metals_reversed_values = pd.Categorical(metals[::-1],
                         categories = metal_categories,
                         ordered = True)
metals_reversed_values

['bronze', 'silver', 'gold', 'bronze']
Categories (3, object): ['bronze' < 'silver' < 'gold']

In [45]:
# Being ordered lets two Categoricals be compared element-wise;
# the comparison is made on the category codes.
metals < metals_reversed_values

array([False, False,  True, False])

In [46]:
metals.codes # codes of the left operand of the comparison

array([0, 2, 1, 0], dtype=int8)

In [47]:
metals_reversed_values.codes # codes of the right operand of the comparison

array([0, 1, 2, 0], dtype=int8)

---

In [48]:
metal_categories # the allowed category labels

['bronze', 'silver', 'gold']

In [49]:
pd.Categorical(['bronze', 'copper'], categories = metal_categories) # values outside the given categories ('copper') become NaN, which filters out invalid labels at construction time

['bronze', NaN]
Categories (3, object): ['bronze', 'silver', 'gold']

---

In [50]:
# Category labels and the integer codes that index into them.
categories = ['foo', 'bar', 'baz']
codes = [0, 1, 2, 0, 0, 1]

In [51]:
# Build a Categorical directly from integer codes plus the category labels.
my_cat = pd.Categorical.from_codes(codes = codes, categories = categories)
my_cat

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo', 'bar', 'baz']

In [52]:
# If you forget from_codes(), a list comprehension decodes the labels just as easily;
# pd.Categorical() can then build the Categorical from them.
tmp = [categories[i] for i in codes]
tmp

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']

In [53]:
pd.Categorical(tmp, categories = categories) # build the Categorical from the decoded labels, keeping the given category order

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo', 'bar', 'baz']

## 重新命名類別

In [54]:
# Start from a Categorical with the placeholder labels a, b, c.
cat = pd.Categorical(['a', 'b', 'c', 'a'],
                   categories = ['a', 'b', 'c'])
cat

['a', 'b', 'c', 'a']
Categories (3, object): ['a', 'b', 'c']

In [55]:
# Rename the categories (a -> bronze, b -> silver, c -> gold).
# Assigning to `cat.categories` directly was deprecated in pandas 1.5 and removed in
# pandas 2.0, so use rename_categories(), which returns a new Categorical, and rebind the name.
cat = cat.rename_categories(['bronze', 'silver', 'gold'])
cat

['bronze', 'silver', 'gold', 'bronze']
Categories (3, object): ['bronze', 'silver', 'gold']

In [56]:
cat.rename_categories(['x', 'y', 'z']) # not in-place: returns a new Categorical

['x', 'y', 'z', 'x']
Categories (3, object): ['x', 'y', 'z']

In [57]:
cat # still bronze/silver/gold: the rename_categories() call did not modify it

['bronze', 'silver', 'gold', 'bronze']
Categories (3, object): ['bronze', 'silver', 'gold']

## 附加新類別

In [58]:
# Create an ordered Categorical: bronze < silver < gold.
metal_values = ['bronze', 'gold', 'silver', 'bronze']
metal_categories = ['bronze', 'silver', 'gold']
metals = pd.Categorical(metal_values, 
                        categories = metal_categories, 
                        ordered = True) 
metals

['bronze', 'gold', 'silver', 'bronze']
Categories (3, object): ['bronze' < 'silver' < 'gold']

In [59]:
# add_categories() returns a new Categorical with the extra category appended at the end of the order.
with_platinum = metals.add_categories(['platinum'])
with_platinum

['bronze', 'gold', 'silver', 'bronze']
Categories (4, object): ['bronze' < 'silver' < 'gold' < 'platinum']

## 移除類別

In [60]:
# remove_categories() returns a new Categorical; values of the removed category become NaN.
no_bronze = metals.remove_categories(['bronze'])
no_bronze

[NaN, 'gold', 'silver', NaN]
Categories (2, object): ['silver' < 'gold']

## 移除未使用的類別

In [61]:
with_platinum # 'platinum' is a declared category but never occurs in the values

['bronze', 'gold', 'silver', 'bronze']
Categories (4, object): ['bronze' < 'silver' < 'gold' < 'platinum']

In [62]:
with_platinum.remove_unused_categories() # drops the categories that no value uses ('platinum')

['bronze', 'gold', 'silver', 'bronze']
Categories (3, object): ['bronze' < 'silver' < 'gold']

## 設定類別

In [63]:
# A categorical Series with four inferred categories.
s = pd.Series(['one', 'two', 'four', 'five'], dtype = 'category')
s

0     one
1     two
2    four
3    five
dtype: category
Categories (4, object): ['five', 'four', 'one', 'two']

In [64]:
# set_categories() replaces the category list; values missing from the new list ('two', 'five') become NaN.
s = s.cat.set_categories(['one', 'four'])
s

0     one
1     NaN
2    four
3     NaN
dtype: category
Categories (2, object): ['one', 'four']

## 類別物件的敘述性資訊

In [65]:
metals # the ordered Categorical (bronze < silver < gold) described in this section

['bronze', 'gold', 'silver', 'bronze']
Categories (3, object): ['bronze' < 'silver' < 'gold']

In [66]:
metals.describe() # descriptive summary of the Categorical: counts and frequencies per category

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
bronze,2,0.5
silver,1,0.25
gold,1,0.25


In [67]:
# Buckets (categories) with a count of 0 would be listed here as well.
metals.value_counts()

bronze    2
silver    1
gold      1
dtype: int64

In [68]:
metals.min() # lowest-ranked category present in the data

'bronze'

In [69]:
metals.max() # highest-ranked category present in the data

'gold'

In [70]:
metals.mode() # most frequent value(s), returned as a Categorical

['bronze']
Categories (3, object): ['bronze' < 'silver' < 'gold']

## 學校成績轉換

In [71]:
# Reproducible demo data: ten students, each with a random integer grade in [50, 100].
np.random.seed(123456)
names = [
    'Ivana', 'Norris', 'Ruth', 'Lane', 'Skye',
    'Sol', 'Dylan', 'Katina', 'Alissa', 'Marc',
]
# `high` is exclusive, so 101 allows a top grade of 100.
grades = np.random.randint(low = 50, high = 101, size = len(names))
scores = pd.DataFrame(dict(Name = names, Grade = grades))
scores

Unnamed: 0,Name,Grade
0,Ivana,51
1,Norris,92
2,Ruth,100
3,Lane,99
4,Skye,93
5,Sol,97
6,Dylan,93
7,Katina,77
8,Alissa,82
9,Marc,73


In [72]:
# Bin edges for the grade scale: 14 edges delimit 13 intervals, one per letter grade.
score_bins = [0, 59, 62, 66, 69, 72, 76, 79, 82, 86, 89, 92, 99, 100]
# Letter grades from lowest to highest, one label per interval between consecutive edges.
letter_grades = [
    'F',
    'D-', 'D', 'D+',
    'C-', 'C', 'C+',
    'B-', 'B', 'B+',
    'A-', 'A', 'A+',
]

In [73]:
# Map each numeric grade to its letter grade and store it as a new column.
# pd.cut() bins are right-closed, so by default the first bin is (0, 59] and a grade of
# exactly 0 would become NaN; include_lowest=True closes it to [0, 59] so 0 is graded 'F'.
letter_cats = pd.cut(scores.Grade, bins = score_bins, labels = letter_grades,
                     include_lowest = True)
scores['Letter'] = letter_cats
scores

Unnamed: 0,Name,Grade,Letter
0,Ivana,51,F
1,Norris,92,A-
2,Ruth,100,A+
3,Lane,99,A
4,Skye,93,A
5,Sol,97,A
6,Dylan,93,A
7,Katina,77,C+
8,Alissa,82,B-
9,Marc,73,C


In [74]:
scores.Letter # ordered categorical column carrying all 13 letter grades as categories

0     F
1    A-
2    A+
3     A
4     A
5     A
6     A
7    C+
8    B-
9     C
Name: Letter, dtype: category
Categories (13, object): ['F' < 'D-' < 'D' < 'D+' ... 'B+' < 'A-' < 'A' < 'A+']

In [75]:
# This is the Series .describe() method; its summary differs slightly from that of Categorical.describe().
scores.Letter.describe()

count     10
unique     7
top        A
freq       4
Name: Letter, dtype: object

In [76]:
# Categories that never occur are listed too, with a count of 0.
scores.Letter.value_counts()

A     4
F     1
C     1
C+    1
B-    1
A-    1
A+    1
D-    0
D     0
D+    0
C-    0
B     0
B+    0
Name: Letter, dtype: int64

In [77]:
scores.sort_values(by = ['Letter'], ascending = False ) # sort rows from best to worst letter grade, following the category order

Unnamed: 0,Name,Grade,Letter
2,Ruth,100,A+
3,Lane,99,A
4,Skye,93,A
5,Sol,97,A
6,Dylan,93,A
1,Norris,92,A-
8,Alissa,82,B-
7,Katina,77,C+
9,Marc,73,C
0,Ivana,51,F


## 綜合應用

In [78]:
# Demo frame: one row per basket with its fruit, a random count and a random weight.
# NOTE(review): no seed is set in this cell, so the random columns depend on the RNG state left by earlier cells.
fruits = ['apple', 'orange', 'apple', 'apple']*2
N = len(fruits) # note: rebinds N, which held the benchmark size earlier in the notebook
df = pd.DataFrame({'fruit': fruits,
                   'basket_id': np.arange(N),
                   'count': np.random.randint(3, 15, size = N),
                   'weight': np.random.uniform(0, 4, size = N)},
                   columns = ['basket_id', 'fruit', 'count', 'weight']) # `columns` determines the column order
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,7,1.541779
1,1,orange,11,3.842247
2,2,apple,13,3.769841
3,3,apple,13,1.410969
4,4,apple,14,0.702184
5,5,orange,7,0.635228
6,6,apple,14,3.816366
7,7,apple,5,0.561859


In [79]:
# Convert the string column to the category dtype.
fruit_cat = df['fruit'].astype('category')
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [80]:
c = fruit_cat.values # the values are not a NumPy array but a pandas.Categorical instance
type(c)

pandas.core.arrays.categorical.Categorical

In [81]:
c.categories # the distinct labels, held in an Index

Index(['apple', 'orange'], dtype='object')

In [82]:
c.codes # one integer code per row, indexing into c.categories

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

In [83]:
# The return type follows the object whose .take() is called: c.categories is an Index, so an Index comes back.
c.categories.take(c.codes)

Index(['apple', 'orange', 'apple', 'apple', 'apple', 'orange', 'apple',
       'apple'],
      dtype='object')

---

In [84]:
# 1000 reproducible draws from np.random.randn; show the first five.
np.random.seed(12345)
draws = np.random.randn(1000)
draws[:5]

array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057])

In [85]:
# qcut() bins by sample quantiles; q = 4 yields quartiles, returned as a Categorical of intervals.
bins = pd.qcut(draws, q = 4)
bins

[(-0.684, -0.0101], (-0.0101, 0.63], (-0.684, -0.0101], (-0.684, -0.0101], (0.63, 3.928], ..., (-0.0101, 0.63], (-0.684, -0.0101], (-2.9499999999999997, -0.684], (-0.0101, 0.63], (0.63, 3.928]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.9499999999999997, -0.684] < (-0.684, -0.0101] < (-0.0101, 0.63] < (0.63, 3.928]]

In [86]:
bins.describe() # each quartile holds 250 of the 1000 draws

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
"(-2.9499999999999997, -0.684]",250,0.25
"(-0.684, -0.0101]",250,0.25
"(-0.0101, 0.63]",250,0.25
"(0.63, 3.928]",250,0.25


In [87]:
# Same quartile binning, but with readable labels instead of interval categories.
bins = pd.qcut(draws, 4, labels = ['Q1', 'Q2', 'Q3', 'Q4'])
bins

['Q2', 'Q3', 'Q2', 'Q2', 'Q4', ..., 'Q3', 'Q2', 'Q1', 'Q3', 'Q4']
Length: 1000
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

In [88]:
bins.describe() # counts and frequencies per quartile label

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Q1,250,0.25
Q2,250,0.25
Q3,250,0.25
Q4,250,0.25


In [89]:
bins.codes[:10] # integer codes of the first ten draws (0 = Q1 ... 3 = Q4)

array([1, 2, 1, 1, 3, 3, 2, 2, 3, 3], dtype=int8)

In [90]:
# Wrap the Categorical in a named Series; the name becomes the group-key label in the groupby result.
bins = pd.Series(bins, name = 'quantile')
bins

0      Q2
1      Q3
2      Q2
3      Q2
4      Q4
       ..
995    Q3
996    Q2
997    Q1
998    Q3
999    Q4
Name: quantile, Length: 1000, dtype: category
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

In [91]:
pd.Series(draws) # the raw draws as a Series; it shares the default 0..999 index with bins

0     -0.204708
1      0.478943
2     -0.519439
3     -0.555730
4      1.965781
         ...   
995    0.107657
996   -0.139298
997   -1.159926
998    0.618965
999    1.373890
Length: 1000, dtype: float64

In [92]:
# Summarise the draws per quartile label: number of draws, smallest and largest value.
# observed=False is passed explicitly: it keeps every declared category in the result and
# avoids the FutureWarning newer pandas emits when grouping by a categorical key without it.
results = pd.Series(draws).groupby(bins, observed = False).agg(['count', 'min', 'max'])
results

Unnamed: 0_level_0,count,min,max
quantile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Q1,250,-2.949343,-0.685484
Q2,250,-0.683066,-0.010115
Q3,250,-0.010032,0.628894
Q4,250,0.634238,3.927528


In [93]:
# Turn the 'quantile' index back into a regular column.
# NOTE(review): the cell reassigns its own input, so running it twice resets the index again and adds an extra 'index' column.
results = results.reset_index()
results

Unnamed: 0,quantile,count,min,max
0,Q1,250,-2.949343,-0.685484
1,Q2,250,-0.683066,-0.010115
2,Q3,250,-0.010032,0.628894
3,Q4,250,0.634238,3.927528


In [94]:
results['quantile'] # the column keeps its ordered categorical dtype after reset_index()

0    Q1
1    Q2
2    Q3
3    Q4
Name: quantile, dtype: category
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

---

In [95]:
# A categorical Series with four categories, each occurring twice.
s_cat = pd.Series(['a', 'b', 'c', 'd']*2, dtype = 'category')
s_cat

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [96]:
s_cat.cat.codes # integer encoding: a single column with one code per category

0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8

In [97]:
# Build dummy (indicator) variables for modelling: one-hot encoding.
# Compare with s_cat.cat.codes, which encodes the same data as a single integer column.
# dtype=int keeps the indicators as 0/1 on every pandas version
# (pandas 2.0+ would otherwise return boolean True/False columns).
pd.get_dummies(s_cat, dtype = int)

Unnamed: 0,a,b,c,d
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,1,0,0,0
5,0,1,0,0
6,0,0,1,0
7,0,0,0,1
