In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# How to handle categorical data?

- String/text data values

- Categorical data:
        
    - __Ordinal data__: Categorical data yang dapat diwakili oleh angka
        
        _contoh_: ranking, level, degree, jabatan
        
        _teknik:_ __Labelling__, misal ```S1 = 0``` dan ```S2 = 1```
        
    - __Nominal data__: Categorical data tidak dapat diwakili oleh angka
        
        _contoh:_ nama, gender, warna
        
        _teknik:_ __Dummy Variables__ & __One Hot Encoder__

<hr>

### 1. Labelling Without SKlearn

In [2]:
# Small employee table used by the labelling examples.
# 'no' is stored as a string, 'gaji' (salary) as an integer.
df = pd.DataFrame({
    'no': ['1', '2', '3', '4', '5'],
    'nama': ['Andi', 'Budi', 'Caca', 'Deni', 'Euis'],
    'jabatan': ['Staff', 'Staff', 'Head', 'Head', 'Manager'],
    'gaji': [8000000, 8000000, 20000000, 20000000, 55000000],
})
df

Unnamed: 0,no,nama,jabatan,gaji
0,1,Andi,Staff,8000000
1,2,Budi,Staff,8000000
2,3,Caca,Head,20000000
3,4,Deni,Head,20000000
4,5,Euis,Manager,55000000


In [3]:
# Manual ordinal labelling: Staff -> 1, Head -> 2, Manager -> 3.
# An explicit lookup table replaces the nested conditional so that an
# unexpected or misspelled job title becomes NaN instead of silently being
# labelled as Manager (3).
jabatan_rank = {'Staff': 1, 'Head': 2, 'Manager': 3}
dfA = df.copy()
dfA['labeljabatan'] = dfA['jabatan'].map(jabatan_rank)
dfA

Unnamed: 0,no,nama,jabatan,gaji,labeljabatan
0,1,Andi,Staff,8000000,1
1,2,Budi,Staff,8000000,1
2,3,Caca,Head,20000000,2
3,4,Deni,Head,20000000,2
4,5,Euis,Manager,55000000,3


<hr>

### 2a. Labelling with SKlearn (```LabelEncoder()```)
    - Categorical data yang dapat diwakili dengan angka : ranking, degree, level
    - ```LabelEncoder()``` sebaiknya digunakan untuk labelling data target y terutama pada kasus klasifikasi, karena kode label mengikuti urutan abjad, bukan urutan ranking

In [4]:
# Separate copy of the employee table, so this section does not share an object with `df`.
dfB = df.copy()

In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
# Unfitted encoder; it learns its classes when fit() is called.
label = LabelEncoder()

In [7]:
# Learn the distinct job titles. Fit on this section's own copy (dfB) rather
# than the global `df`, which a later cell rebinds to a different table.
label.fit(dfB['jabatan'])

LabelEncoder()

In [8]:
# Check the result: encoded integers next to the original strings.
# Reads from dfB (this section's copy) so the cell still works after `df`
# is rebound further down the notebook.
print(label.transform(dfB['jabatan']))
print(dfB['jabatan'].values.tolist())

# Check the label order: the position of each class in classes_ is its code.
print(label.classes_)


[2 2 0 0 1]
['Staff', 'Staff', 'Head', 'Head', 'Manager']
['Head' 'Manager' 'Staff']


In [9]:
# inverse_transform maps integer codes back to the original labels.
for codes in ([0, 1, 2], [2, 1, 2, 0]):
    print(label.inverse_transform(codes))

['Head' 'Manager' 'Staff']
['Staff' 'Manager' 'Staff' 'Head']


In [10]:
# fit_transform learns the classes and encodes in a single call.
# Uses dfB (this section's copy) instead of the global `df`, which is
# rebound to a different table later in the notebook.
label.fit_transform(dfB['jabatan'])

array([2, 2, 0, 0, 1])

In [11]:
# A case that suits LabelEncoder: encoding the target y for
# classification/clustering.
dfSp = pd.DataFrame({
    'tinggi': [500, 520, 460, 25, 20, 21],
    'berat': [175, 200, 180, 4, 3, 5],
    'spesies': ['Jerapah'] * 3 + ['Kucing'] * 3,
})
dfSp

Unnamed: 0,tinggi,berat,spesies
0,500,175,Jerapah
1,520,200,Jerapah
2,460,180,Jerapah
3,25,4,Kucing
4,20,3,Kucing
5,21,5,Kucing


In [12]:
# Encode the species target once, then decode the same codes to show the
# round trip. The encoded array is reused instead of fitting a second time.
labelSp = LabelEncoder()
spesies_encoded = labelSp.fit_transform(dfSp['spesies'])
print(spesies_encoded)
print(labelSp.inverse_transform(spesies_encoded))

[0 0 0 1 1 1]
['Jerapah' 'Jerapah' 'Jerapah' 'Kucing' 'Kucing' 'Kucing']


<hr>

### 2b. Labelling with SKlearn (```OrdinalEncoder()```)

In [13]:
# Separate copy of the employee table for the OrdinalEncoder examples.
dfC = df.copy()

In [14]:
from sklearn.preprocessing import OrdinalEncoder

In [15]:
# Pass the rank order explicitly so the codes follow the hierarchy
# (Staff -> 0, Head -> 1, Manager -> 2). fit() needs 2-D input, hence the
# double brackets around the column name.
labelOE = OrdinalEncoder(categories=[['Staff', 'Head', 'Manager']]).fit(
    dfC[['jabatan']]
)
labelOE

OrdinalEncoder(categories=[['Staff', 'Head', 'Manager']],
               dtype=<class 'numpy.float64'>)

In [16]:
# categories_ holds the category order in use; a value's position in that list is its code.
print(labelOE.categories_)
# transform() takes 2-D input (double brackets) and returns a 2-D float array.
print(labelOE.transform(dfC[['jabatan']]))

[array(['Staff', 'Head', 'Manager'], dtype=object)]
[[0.]
 [0.]
 [1.]
 [1.]
 [2.]]


In [17]:
# Decode codes back to job titles; the input must be a single 2-D column.
print(labelOE.inverse_transform(np.array([[1], [2], [2], [1]])))

[['Head']
 ['Manager']
 ['Manager']
 ['Head']]


<hr>

### 3a. Dummy Variables
    - Categorical data tidak dapat diwakili dengan angka : nama, gender, spesies, jenis, warna

In [18]:
# House-price toy data: area ('luas'), city ('kota') and price ('harga').
dfDv = pd.DataFrame({
    'luas': [50, 100, 150, 50, 100, 150],
    'kota': ['Jakarta'] * 3 + ['Yogyakarta'] * 3,
    'harga': [500, 1000, 1500, 200, 400, 600],
})

In [19]:
# Correlation is only defined for numeric columns. Older pandas silently
# dropped the string column 'kota', but pandas >= 2.0 raises on it, so the
# numeric columns are selected explicitly (works on every pandas version).
dfDv.select_dtypes(include='number').corr()

Unnamed: 0,luas,harga
luas,1.0,0.661438
harga,0.661438,1.0


In [20]:
# One indicator column per city. dtype=int keeps the indicators as 0/1 on
# every pandas version (pandas >= 2.0 would otherwise return True/False).
dfDummy = pd.get_dummies(dfDv['kota'], dtype=int)
dfDummy

Unnamed: 0,Jakarta,Yogyakarta
0,1,0
1,1,0
2,1,0
3,0,1
4,0,1
5,0,1


In [21]:
# Attach the dummy columns to the house-price table (column-wise concat, aligned on the index).
# NOTE(review): this rebinds `df`, which previously held the employee table; any earlier
# cell that reads `df` would see this frame instead if re-run out of order. Consider a distinct name.
df= pd.concat([dfDv,dfDummy], axis='columns')
df

Unnamed: 0,luas,kota,harga,Jakarta,Yogyakarta
0,50,Jakarta,500,1,0
1,100,Jakarta,1000,1,0
2,150,Jakarta,1500,1,0
3,50,Yogyakarta,200,0,1
4,100,Yogyakarta,400,0,1
5,150,Yogyakarta,600,0,1


In [22]:
from sklearn.linear_model import LinearRegression

In [23]:
# Unfitted linear regression for the dummy-variable example.
modelDv = LinearRegression()

In [24]:
# Fit price on area plus both city indicators.
# NOTE(review): Jakarta + Yogyakarta equals 1 on every row, so together with the model's
# intercept one indicator is redundant (dummy-variable trap) — consider dropping one of them.
modelDv.fit(df[['luas','Jakarta','Yogyakarta']],df['harga'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [25]:
# Predict on the same feature columns used for fitting and store the result next to the actual 'harga'.
df['hargapred'] = modelDv.predict(df[['luas','Jakarta','Yogyakarta']])

In [26]:
# Show actual ('harga') and predicted ('hargapred') prices side by side.
df

Unnamed: 0,luas,kota,harga,Jakarta,Yogyakarta,hargapred
0,50,Jakarta,500,1,0,650.0
1,100,Jakarta,1000,1,0,1000.0
2,150,Jakarta,1500,1,0,1350.0
3,50,Yogyakarta,200,0,1,50.0
4,100,Yogyakarta,400,0,1,400.0
5,150,Yogyakarta,600,0,1,750.0


<hr>

### 3b. One Hot Encoding
    - Categorical data tidak dapat diwakili dengan angka : nama, gender, spesies, jenis, warna
    - Teknik One Hot Encoding : diawali dengan labelling

In [27]:
# House-price toy data with two categorical columns: 'kota' and 'grade'.
dfOhe = pd.DataFrame({
    'luas': [50, 100, 150, 50, 100, 150],
    'kota': ['Jakarta'] * 3 + ['Yogyakarta'] * 3,
    'grade': ['A', 'B', 'C', 'A', 'B', 'C'],
    'harga': [500, 1000, 1500, 200, 400, 600],
})

#### a. Labelling

In [28]:

# Learn the categories of both categorical columns in one pass; with no
# explicit `categories` argument the encoder determines them from the data.
labelOhe = OrdinalEncoder().fit(dfOhe[['kota', 'grade']])
labelOhe

OrdinalEncoder(categories='auto', dtype=<class 'numpy.float64'>)

In [29]:
# Encoded values as a 2-D float array: one output column per input column ('kota', 'grade').
labelOhe.transform(dfOhe[['kota', 'grade']])

array([[0., 0.],
       [0., 1.],
       [0., 2.],
       [1., 0.],
       [1., 1.],
       [1., 2.]])

In [30]:
# Wrap the encoded array in a DataFrame with readable column names.
# The index is taken from dfOhe so that a later column-wise concat lines the
# rows up even if dfOhe does not carry the default 0..n-1 index.
dfLabel = pd.DataFrame(
    labelOhe.transform(dfOhe[['kota', 'grade']]),
    columns=['labelkota', 'labelgrade'],
    index=dfOhe.index,
)
dfLabel

Unnamed: 0,labelkota,labelgrade
0,0.0,0.0
1,0.0,1.0
2,0.0,2.0
3,1.0,0.0
4,1.0,1.0
5,1.0,2.0


In [31]:
# Put the encoded columns next to the original ones (column-wise concat, aligned on the index).
# NOTE(review): this rebinds `df` again; any earlier cell that reads `df` would see this
# frame if re-run out of order. Consider a distinct name.
df = pd.concat([dfOhe,dfLabel],axis = 1)
df

Unnamed: 0,luas,kota,grade,harga,labelkota,labelgrade
0,50,Jakarta,A,500,0.0,0.0
1,100,Jakarta,B,1000,0.0,1.0
2,150,Jakarta,C,1500,0.0,2.0
3,50,Yogyakarta,A,200,1.0,0.0
4,100,Yogyakarta,B,400,1.0,1.0
5,150,Yogyakarta,C,600,1.0,2.0


#### b. One Hot Encoder
    - Mirip seperti dummy variables
    - better pisahkan feature x dan target y

In [32]:
# Split the features (x) from the target (y).
# Column positions in x: 0 = luas, 1 = labelkota, 2 = labelgrade.
x = df[['luas','labelkota','labelgrade']]
y = df['harga']

In [33]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [34]:
# Without remainder='passthrough': only the one-hot columns are returned and
# the other columns of x are dropped.
# The column is selected by name instead of by position, so the encoder keeps
# targeting 'labelkota' even if the column order of x changes.
coltrans = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), ['labelkota'])],
)

In [35]:
# Only the two one-hot columns come back; luas and labelgrade are not in the result
# because this transformer was built without remainder='passthrough'.
xA = coltrans.fit_transform(x)
xA

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.]])

In [36]:
# With remainder='passthrough': the one-hot columns come first, followed by
# the untouched remaining columns of x.
# The column is selected by name instead of by position, so the encoder keeps
# targeting 'labelkota' even if the column order of x changes.
coltrans = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), ['labelkota'])],
    remainder='passthrough',
)

In [37]:
# Output layout: the two one-hot city columns first, then the passed-through luas and labelgrade.
xB = coltrans.fit_transform(x)
xB

array([[  1.,   0.,  50.,   0.],
       [  1.,   0., 100.,   1.],
       [  1.,   0., 150.,   2.],
       [  0.,   1.,  50.,   0.],
       [  0.,   1., 100.,   1.],
       [  0.,   1., 150.,   2.]])

In [38]:
# Fit a linear regression on the encoded feature matrix; fit() returns the
# estimator itself, which is then displayed.
model = LinearRegression().fit(xB, y)
model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [39]:
# Store the model's predictions next to the actual 'harga' column and show the table.
# NOTE(review): the new column name contains a literal double-quote character (harga");
# the dummy-variable section named the equivalent column 'hargapred' — confirm which is intended.
df['harga"'] = model.predict(xB)
df

Unnamed: 0,luas,kota,grade,harga,labelkota,labelgrade,"harga"""
0,50,Jakarta,A,500,0.0,0.0,650.0
1,100,Jakarta,B,1000,0.0,1.0,1000.0
2,150,Jakarta,C,1500,0.0,2.0,1350.0
3,50,Yogyakarta,A,200,1.0,0.0,50.0
4,100,Yogyakarta,B,400,1.0,1.0,400.0
5,150,Yogyakarta,C,600,1.0,2.0,750.0
