In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# How to handle categorical data?

- String/text data values

- Categorical data:
        
    - __Ordinal data__: Categorical data yang memiliki urutan/tingkatan sehingga dapat diwakili oleh angka
        
        _contoh_: ranking, level, degree, jabatan
        
        _teknik:_ __Labelling__, misal ```S1 = 0``` dan ```S2 = 1```
        
    - __Nominal data__: Categorical data yang tidak memiliki urutan sehingga tidak dapat diwakili oleh angka
        
        _contoh:_ nama, gender, warna
        
        _teknik:_ __Dummy Variables__ & __One Hot Encoder__

<hr>

### 1a. Labelling without Sklearn

- Categorical data yang dapat diwakili dengan angka: ranking, degree, level

In [41]:
# Sample employee table used by the labelling examples.
# Built column by column; the column order is no, nama, jabatan, gaji.
df = pd.DataFrame({
    'no': [1, 2, 3, 4, 5],
    'nama': ['Andi', 'Budi', 'Caca', 'Deni', 'Euis'],
    'jabatan': ['Staff', 'Staff', 'Head', 'Head', 'Manager'],
    'gaji': [8000000, 8000000, 20000000, 20000000, 55000000],
})
df

Unnamed: 0,no,nama,jabatan,gaji
0,1,Andi,Staff,8000000
1,2,Budi,Staff,8000000
2,3,Caca,Head,20000000
3,4,Deni,Head,20000000
4,5,Euis,Manager,55000000


In [15]:
# Independent copy of the employee table for the manual-labelling example,
# followed by the distinct job titles in order of first appearance.
dfA = df.copy()
dfA['jabatan'].drop_duplicates().tolist()

['Staff', 'Head', 'Manager']

In [16]:
# Ordinal labelling: Staff -> 1, Head -> 2, Manager -> 3.
# An explicit lookup table replaces the nested if/else so that an unexpected
# job title becomes NaN instead of being silently labelled 3 (Manager).
jabatanLabels = {'Staff': 1, 'Head': 2, 'Manager': 3}
dfA['labelJabatan'] = dfA['jabatan'].map(jabatanLabels)
dfA

Unnamed: 0,no,nama,jabatan,gaji,labelJabatan
0,1,Andi,Staff,8000000,1
1,2,Budi,Staff,8000000,1
2,3,Caca,Head,20000000,2
3,4,Deni,Head,20000000,2
4,5,Euis,Manager,55000000,3


<hr>

### 1b. Labelling with Sklearn ```LabelEncoder()```

- Categorical data yang dapat diwakili dengan angka: ranking, degree, level.
- ```LabelEncoder()``` sebaiknya digunakan untuk labelling data target y, terutama pada kasus klasifikasi.
- ```LabelEncoder()``` melakukan labelling sesuai urutan alphabetis.

In [21]:
# Independent copy of the employee table for the LabelEncoder section
dfB = df.copy()
dfB

Unnamed: 0,no,nama,jabatan,gaji
0,1,Andi,Staff,8000000
1,2,Budi,Staff,8000000
2,3,Caca,Head,20000000
3,4,Deni,Head,20000000
4,5,Euis,Manager,55000000


In [20]:
# Distinct job titles, listed in order of first appearance
dfB['jabatan'].unique().tolist()

['Staff', 'Head', 'Manager']

In [22]:
from sklearn.preprocessing import LabelEncoder

In [23]:
# Encoder instance for the job-title column (not fitted yet)
label = LabelEncoder()

In [24]:
# Fit the encoder (learn the classes) on this section's own copy, dfB.
# `df` is rebound to a different table later in the notebook, so reading it
# here would break when the cell is re-run after that point.
label.fit(dfB['jabatan'])

LabelEncoder()

In [31]:
# Check the labelling result against the original values.
# Reads this section's copy (dfB) because `df` is rebound later in the notebook.
print(label.transform(dfB['jabatan']))
print(dfB['jabatan'].values.tolist())

# Check the label order: the position in classes_ is the encoded value
print(label.classes_)

[2 2 0 0 1]
['Staff', 'Staff', 'Head', 'Head', 'Manager']
['Head' 'Manager' 'Staff']


In [32]:
# Inverse transform: map encoded integers back to their job titles
for kode in ([0, 1, 2], [2, 1, 2, 0]):
    print(label.inverse_transform(kode))

['Head' 'Manager' 'Staff']
['Staff' 'Manager' 'Staff' 'Head']


In [33]:
# Fit the encoder and encode in a single call.
# Reads this section's copy (dfB) because `df` is rebound later in the notebook.
label.fit_transform(dfB['jabatan'])

array([2, 2, 0, 0, 1])

In [35]:
# Example where LabelEncoder fits well: labelling the target y in a
# classification/clustering problem (here, the species column).
dfSp = pd.DataFrame({
    'tinggi': [500, 520, 460, 25, 20, 21],
    'berat': [175, 200, 180, 4, 3, 5],
    'spesies': ['Jerapah'] * 3 + ['Kucing'] * 3,
})
dfSp

Unnamed: 0,tinggi,berat,spesies
0,500,175,Jerapah
1,520,200,Jerapah
2,460,180,Jerapah
3,25,4,Kucing
4,20,3,Kucing
5,21,5,Kucing


In [38]:
# Encode the species column once and reuse the result; the original refitted
# the encoder a second time just to feed inverse_transform.
labelSp = LabelEncoder()
kodeSpesies = labelSp.fit_transform(dfSp['spesies'])
print(kodeSpesies)
# Round trip: decoding the codes gives back the original species names
print(labelSp.inverse_transform(kodeSpesies))

[0 0 0 1 1 1]
['Jerapah' 'Jerapah' 'Jerapah' 'Kucing' 'Kucing' 'Kucing']


<hr>

### 1c. Labelling with Sklearn ```OrdinalEncoder()```

- Categorical data yang dapat diwakili dengan angka: ranking, degree, level.
- ```OrdinalEncoder()``` menerima input 2D (misal ```df[['jabatan']]```), dan urutan kategori dapat diatur lewat parameter ```categories```.

In [42]:
# Independent copy of the employee table for the OrdinalEncoder section
dfC = df.copy()
dfC

Unnamed: 0,no,nama,jabatan,gaji
0,1,Andi,Staff,8000000
1,2,Budi,Staff,8000000
2,3,Caca,Head,20000000
3,4,Deni,Head,20000000
4,5,Euis,Manager,55000000


In [43]:
from sklearn.preprocessing import OrdinalEncoder

In [70]:
# Encode with an explicit category order: 'Staff' < 'Head' < 'Manager'.
# Without `categories`, the encoder orders the categories alphabetically.
urutanJabatan = ['Staff', 'Head', 'Manager']

# One inner list per encoded column, hence the extra pair of brackets
labelOE = OrdinalEncoder(categories=[urutanJabatan])
labelOE.fit(dfC[['jabatan']])

OrdinalEncoder(categories=[['Staff', 'Head', 'Manager']],
               dtype=<class 'numpy.float64'>)

In [71]:
# Show the category order in use, then the encoded job titles (one row per record)
print(labelOE.categories_)
print(labelOE.transform(dfC[['jabatan']]))

[array(['Staff', 'Head', 'Manager'], dtype=object)]
[[0.]
 [0.]
 [1.]
 [1.]
 [2.]]


In [72]:
# Decode a column vector of codes (shape 4x1) back to job titles
kodeJabatan = np.array([[1], [2], [2], [1]])
print(labelOE.inverse_transform(kodeJabatan))

[['Head']
 ['Manager']
 ['Manager']
 ['Head']]


<hr>

### 2a. Dummy Variables

- Categorical data tidak dapat diwakili dengan angka: nama, gender, spesies, jenis, warna

In [74]:
# House-price table for the dummy-variable example.
# NOTE: this rebinds `df`, replacing the employee table used in section 1.
df = pd.DataFrame({
    'luas': [50, 100, 150, 50, 100, 150],
    'kota': ['Jakarta'] * 3 + ['Yogyakarta'] * 3,
    'harga': [500, 1000, 1500, 200, 400, 600],
})
df

Unnamed: 0,luas,kota,harga
0,50,Jakarta,500
1,100,Jakarta,1000
2,150,Jakarta,1500
3,50,Yogyakarta,200
4,100,Yogyakarta,400
5,150,Yogyakarta,600


In [75]:
# Correlation between the numeric columns only. The string column `kota` is
# excluded explicitly: recent pandas versions no longer drop non-numeric
# columns silently in corr() and raise instead.
df.select_dtypes(include='number').corr()

Unnamed: 0,luas,harga
luas,1.0,0.661438
harga,0.661438,1.0


In [76]:
# One indicator column per city. The dtype is pinned so the columns are 0/1
# integers on every pandas version (newer releases default to True/False).
dfDummy = pd.get_dummies(df['kota'], dtype='uint8')
dfDummy

Unnamed: 0,Jakarta,Yogyakarta
0,1,0
1,1,0
2,1,0
3,0,1
4,0,1
5,0,1


In [78]:
# Append the dummy columns to the table. Any dummy columns already present
# are dropped first, so re-running this cell does not duplicate them.
df = pd.concat(
    [df.drop(columns=dfDummy.columns, errors='ignore'), dfDummy],
    axis='columns',
)
df

Unnamed: 0,luas,kota,harga,Jakarta,Yogyakarta
0,50,Jakarta,500,1,0
1,100,Jakarta,1000,1,0
2,150,Jakarta,1500,1,0
3,50,Yogyakarta,200,0,1
4,100,Yogyakarta,400,0,1
5,150,Yogyakarta,600,0,1


In [79]:
# Linear regression estimator; it is fitted on the house data in the next cell
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [80]:
# Features: area plus the two city indicator columns; target: price
kolomFitur = ['luas', 'Jakarta', 'Yogyakarta']
model.fit(df[kolomFitur], df['harga'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [81]:
# Predict the price for every training row and store it next to the actual
# price in a column named `harga"`.
hargaPrediksi = model.predict(df[['luas', 'Jakarta', 'Yogyakarta']])
df['harga"'] = hargaPrediksi
df

Unnamed: 0,luas,kota,harga,Jakarta,Yogyakarta,"harga"""
0,50,Jakarta,500,1,0,650.0
1,100,Jakarta,1000,1,0,1000.0
2,150,Jakarta,1500,1,0,1350.0
3,50,Yogyakarta,200,0,1,50.0
4,100,Yogyakarta,400,0,1,400.0
5,150,Yogyakarta,600,0,1,750.0


In [84]:
# Dummy variables on several features at once: house data with two
# categorical columns (kota, grade). Rebinds `df`.
df = pd.DataFrame({
    'luas': [50, 100, 150, 50, 100, 150],
    'kota': ['Jakarta'] * 3 + ['Yogyakarta'] * 3,
    'grade': ['A', 'B', 'C', 'A', 'B', 'C'],
    'harga': [500, 1000, 1500, 200, 400, 600],
})
df.head(1)

Unnamed: 0,luas,kota,grade,harga
0,50,Jakarta,A,500


In [83]:
# Indicator columns for both categorical features; the result columns are
# prefixed with the source column name (kota_*, grade_*). The dtype is pinned
# so the values are 0/1 integers on every pandas version (newer releases
# default to True/False).
dfDummy = pd.get_dummies(df[['kota', 'grade']], dtype='uint8')
dfDummy

Unnamed: 0,kota_Jakarta,kota_Yogyakarta,grade_A,grade_B,grade_C
0,1,0,1,0,0
1,1,0,0,1,0
2,1,0,0,0,1
3,0,1,1,0,0
4,0,1,0,1,0
5,0,1,0,0,1


<hr>

### 2b. One Hot Encoding

- Categorical data tidak dapat diwakili dengan angka: nama, gender, spesies, jenis, warna
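- Teknik One Hot Encoding pada contoh ini diawali dengan labelling (catatan: ```OneHotEncoder``` pada scikit-learn versi 0.20 ke atas juga dapat langsung menerima data string)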

In [86]:
# House data with two categorical columns (kota, grade), recreated from
# scratch for the one-hot-encoding example. Rebinds `df`.
df = pd.DataFrame({
    'luas': [50, 100, 150, 50, 100, 150],
    'kota': ['Jakarta'] * 3 + ['Yogyakarta'] * 3,
    'grade': ['A', 'B', 'C', 'A', 'B', 'C'],
    'harga': [500, 1000, 1500, 200, 400, 600],
})
df.head(1)

Unnamed: 0,luas,kota,grade,harga
0,50,Jakarta,A,500


#### A. Labelling

- Membuat data label dari feature berisi categorical data
- Bisa cara manual labelling, label encoder atau ordinal encoder

In [87]:
# Step 1 - labelling: encode the kota and grade columns as ordinal codes.
# No explicit category order is given here. Rebinds `labelOE`.
kolomKategori = ['kota', 'grade']

labelOE = OrdinalEncoder()
labelOE.fit(df[kolomKategori])

OrdinalEncoder(categories='auto', dtype=<class 'numpy.float64'>)

In [90]:
# Learned categories per column, then the encoded (kota, grade) pair for each row
print(labelOE.categories_)
print(labelOE.transform(df[['kota', 'grade']]))

[array(['Jakarta', 'Yogyakarta'], dtype=object), array(['A', 'B', 'C'], dtype=object)]
[[0. 0.]
 [0. 1.]
 [0. 2.]
 [1. 0.]
 [1. 1.]
 [1. 2.]]


In [93]:
# Wrap the encoded array in a DataFrame with readable column names.
# The index is taken from df so that a later column-wise concat lines up row
# by row even if df does not have a default 0..n-1 index.
dfLabel = pd.DataFrame(
    labelOE.transform(df[['kota', 'grade']]),
    columns=['labelKota', 'labelGrade'],
    index=df.index,
)
dfLabel.head(1)

Unnamed: 0,labelKota,labelGrade
0,0.0,0.0


In [95]:
# Append the label columns to the table. Any label columns already present
# are dropped first, so re-running this cell does not duplicate them.
df = pd.concat(
    [df.drop(columns=dfLabel.columns, errors='ignore'), dfLabel],
    axis=1,
)
df

Unnamed: 0,luas,kota,grade,harga,labelKota,labelGrade
0,50,Jakarta,A,500,0.0,0.0
1,100,Jakarta,B,1000,0.0,1.0
2,150,Jakarta,C,1500,0.0,2.0
3,50,Yogyakarta,A,200,1.0,0.0
4,100,Yogyakarta,B,400,1.0,1.0
5,150,Yogyakarta,C,600,1.0,2.0


#### B. One Hot Encoder

- Mirip seperti dummy variables
- Sebaiknya pisahkan antara feature x & target y

In [119]:
# Split the table into the feature matrix x and the target vector y
kolomFitur = ['luas', 'labelKota', 'labelGrade']
x = df.loc[:, kolomFitur]
y = df.loc[:, 'harga']

In [120]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [121]:
# Without remainder='passthrough'.
# One-hot encode labelKota only (column index 1 in x).
langkahOHE = [('one_hot_encoder', OneHotEncoder(), [1])]
coltrans = ColumnTransformer(langkahOHE)

In [122]:
# Result of one-hot encoding the labelKota feature; only the encoded columns are returned
xA = coltrans.fit_transform(x)
xA

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.]])

In [123]:
# With remainder='passthrough': the columns that are not encoded are kept.
# One-hot encode labelKota only (column index 1 in x). Rebinds `coltrans`.
langkahOHE = [('one_hot_encoder', OneHotEncoder(), [1])]
coltrans = ColumnTransformer(langkahOHE, remainder='passthrough')

In [124]:
# Result of one-hot encoding labelKota, followed by the remaining columns of x
xB = coltrans.fit_transform(x)
xB
# column layout: [ Jakarta  Yogyakarta  luas  grade ]

array([[  1.,   0.,  50.,   0.],
       [  1.,   0., 100.,   1.],
       [  1.,   0., 150.,   2.],
       [  0.,   1.,  50.,   0.],
       [  0.,   1., 100.,   1.],
       [  0.,   1., 150.,   2.]])

In [125]:
# Fresh linear regression fitted on the encoded feature matrix xB against target y
model = LinearRegression()
model.fit(xB, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [127]:
# Predict the price for every training row and store it next to the actual
# price in a column named `harga"`.
hargaPrediksi = model.predict(xB)
df['harga"'] = hargaPrediksi
df

Unnamed: 0,luas,kota,grade,harga,labelKota,labelGrade,"harga"""
0,50,Jakarta,A,500,0.0,0.0,650.0
1,100,Jakarta,B,1000,0.0,1.0,1000.0
2,150,Jakarta,C,1500,0.0,2.0,1350.0
3,50,Yogyakarta,A,200,1.0,0.0,50.0
4,100,Yogyakarta,B,400,1.0,1.0,400.0
5,150,Yogyakarta,C,600,1.0,2.0,750.0
