# Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import Binarizer, MinMaxScaler, normalize

### Sample Data 1

In [2]:
# Toy records: one list per column (name and gender label).
data = {
    "nama": ["budi", "putri", "siska", "jaelani", "hasanuding"],
    "jk": ["pria", "wanita", "wanita", "pria", "pria"],
}

# Column-oriented dict -> DataFrame; keys become the column names.
dataFrame = pd.DataFrame.from_dict(data)
dataFrame

Unnamed: 0,nama,jk
0,budi,pria
1,putri,wanita
2,siska,wanita
3,jaelani,pria
4,hasanuding,pria


### Mengubah Data Jenis Kelamin ke Number

In [3]:
# Integer code assigned to each gender label found in the "jk" column.
preprocessing = {
    "pria" : 0,
    "wanita" : 1
}

# Encode only the "jk" column. A frame-wide replace would also rewrite any
# other column (e.g. a name) that happened to contain one of these labels.
# Assigning the result back avoids inplace=True, and re-running the cell is
# harmless because already-encoded values no longer match any key.
dataFrame["jk"] = dataFrame["jk"].replace(preprocessing)
dataFrame

Unnamed: 0,nama,jk
0,budi,0
1,putri,1
2,siska,1
3,jaelani,0
4,hasanuding,0


### Sample Data 2

In [4]:

# Numeric sample matrix: five rows, three integer columns.
data = np.array(
    [
        [-23, -32, 12],
        [44, 33, 11],
        [56, 34, 67],
        [45, 33, 12],
        [45, 54, 23],
    ]
)
data

array([[-23, -32,  12],
       [ 44,  33,  11],
       [ 56,  34,  67],
       [ 45,  33,  12],
       [ 45,  54,  23]])

### Mengubah data ke Binary

In [5]:
# Threshold the sample matrix: entries above 0.5 map to 1, the rest to 0.
preprocessor = Binarizer(threshold=0.5)
binary_data = preprocessor.transform(data)

# A single print call with a newline separator shows the transformer
# followed by the binarised matrix.
print(preprocessor, binary_data, sep="\n")

Binarizer(threshold=0.5)
[[0 0 1]
 [1 1 1]
 [1 1 1]
 [1 1 1]
 [1 1 1]]


### Scaling (Mengubah data ke skala terkecil)

In [6]:
# Display the raw sample array before it is scaled.
data

array([[-23, -32,  12],
       [ 44,  33,  11],
       [ 56,  34,  67],
       [ 45,  33,  12],
       [ 45,  54,  23]])

In [7]:
# Rescale each column into the [0, 1] range. fit() learns the per-column
# min/max and returns the fitted scaler, so transform() can be chained on it.
preprocessor = MinMaxScaler(feature_range=(0, 1))
scaled_data = preprocessor.fit(data).transform(data)
scaled_data

array([[0.        , 0.        , 0.01785714],
       [0.84810127, 0.75581395, 0.        ],
       [1.        , 0.76744186, 1.        ],
       [0.86075949, 0.75581395, 0.01785714],
       [0.86075949, 1.        , 0.21428571]])

In [8]:
# fit_transform() fits the scaler and transforms the data in a single call;
# the displayed result matches the separate fit() + transform() steps.
scaled_data = preprocessor.fit_transform(data)
scaled_data

array([[0.        , 0.        , 0.01785714],
       [0.84810127, 0.75581395, 0.        ],
       [1.        , 0.76744186, 1.        ],
       [0.86075949, 0.75581395, 0.01785714],
       [0.86075949, 1.        , 0.21428571]])

### L1 Normalisation: Least Absolute Deviations

In [9]:
# Display the raw sample array before L1 normalisation.
data

array([[-23, -32,  12],
       [ 44,  33,  11],
       [ 56,  34,  67],
       [ 45,  33,  12],
       [ 45,  54,  23]])

In [10]:
# L1 normalisation: scale every row (axis=1) so that the absolute values of
# its entries sum to 1.
l1_normalised_data = normalize(data, norm="l1", axis=1)
l1_normalised_data

array([[-0.34328358, -0.47761194,  0.17910448],
       [ 0.5       ,  0.375     ,  0.125     ],
       [ 0.3566879 ,  0.21656051,  0.42675159],
       [ 0.5       ,  0.36666667,  0.13333333],
       [ 0.36885246,  0.44262295,  0.18852459]])

### L2 Normalisation: Least Squares

In [11]:
# Display the raw sample array before L2 normalisation.
data

array([[-23, -32,  12],
       [ 44,  33,  11],
       [ 56,  34,  67],
       [ 45,  33,  12],
       [ 45,  54,  23]])

In [12]:
# L2 normalisation: scale every row (axis=1) to unit Euclidean length, i.e.
# the squares of its entries sum to 1.
l2_normalised_data = normalize(data, norm="l2", axis=1)
l2_normalised_data

array([[-0.55832479, -0.77679971,  0.29129989],
       [ 0.78446454,  0.58834841,  0.19611614],
       [ 0.5976075 ,  0.36283312,  0.71499469],
       [ 0.7883825 ,  0.57814716,  0.21023533],
       [ 0.60844153,  0.73012984,  0.31098123]])