In [2]:
import pandas as pd
import numpy as np
import joblib

In [3]:
# Load the object stored at output/data_irregularities.pkl; the cells below use it as a DataFrame.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary code —
# only load files from a trusted source.
data = joblib.load('output/data_irregularities.pkl')

In [4]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,id,detection_date_millis,update_date_millis,street,city,is_highway,line,s2id_center,s2token_center,speed,...,delay_seconds,seconds,length,trend,type,severity,jam_level,drivers_count,alerts_count,n_thumbs_up
0,12868069,1604733149024,1604735467276,Jatiwaringin Raya,Bekasi,t,"{""line"": [{""x"": 106.91014, ""y"": -6.258107}, {""...",3344471185277583360,2e69f2d2c,13.03,...,299,432,1566,0,Small,5,3,13,0,0
1,12420463,1599906813144,1599909295834,Putri Tunggal,Depok,f,"{""line"": [{""x"": 106.887821, ""y"": -6.377016}, {...",3344462996922433536,2e69eb604,6.56,...,399,539,984,0,Small,5,3,5,0,0
2,12497533,1601728355356,1601734996933,Ir Haji Juanda,Bandung,f,"{""line"": [{""x"": 107.618629, ""y"": -6.87556}, {""...",3344176694402482176,2e68e6fc4,3.36,...,1185,1294,1212,1,Large,5,4,21,2,0
3,12536831,1602312860279,1602315706305,KH Muchtar Tabrani,Bekasi,f,"{""line"": [{""x"": 107.002934, ""y"": -6.216088}, {...",3344358143885836288,2e698c034,4.36,...,467,543,659,-1,Small,5,4,3,0,0
4,12327151,1598956623240,1598957378934,N1 Pangeran Diponegoro,Tambun Selatan,t,"{""line"": [{""x"": 107.035652, ""y"": -6.255471}, {...",3344360723013697536,2e698e5bc,4.74,...,423,474,625,0,Small,5,4,11,0,0


### Split Numerical and Categorical Columns 

In [5]:
# Column names selected into the numerical subset by separate_dtype.
NUMERICAL_COLUMNS = [
    "id",
    "detection_date_millis",
    "update_date_millis",
    "s2id_center",
    "speed",
    "regular_speed",
    "delay_seconds",
    "seconds",
    "length",
    "trend",
    "severity",
    "jam_level",
    "drivers_count",
    "alerts_count",
    "n_thumbs_up",
]

In [6]:
# Column names selected into the categorical subset by separate_dtype.
CATEGORICAL_COLUMNS = [
    "street",
    "city",
    "is_highway",
    "line",
    "s2token_center",
    "type",
]

In [7]:
def separate_dtype(dataset,
                  NUMERICAL_COLUMNS,
                  CATEGORICAL_COLUMNS):
    """Split ``dataset`` into a numerical part and a categorical part.

    Parameters
    ----------
    dataset
        Object indexed with each column selection (a DataFrame in this notebook).
    NUMERICAL_COLUMNS
        Column selection for the first returned object.
    CATEGORICAL_COLUMNS
        Column selection for the second returned object.

    Returns
    -------
    tuple
        ``(numerical, categorical)``; each is a ``.copy()`` of the selection,
        so later edits to them do not touch ``dataset``.
    """
    # Numerical selection is evaluated first, then categorical.
    num_part, cat_part = (
        dataset[selected].copy()
        for selected in (NUMERICAL_COLUMNS, CATEGORICAL_COLUMNS)
    )
    return num_part, cat_part

In [8]:
# Split the loaded frame into its numerical and categorical column subsets.
num_data, cat_data = separate_dtype(
    dataset=data,
    NUMERICAL_COLUMNS=NUMERICAL_COLUMNS,
    CATEGORICAL_COLUMNS=CATEGORICAL_COLUMNS,
)

In [9]:
# Inspect the numerical subset returned by separate_dtype.
num_data

Unnamed: 0,id,detection_date_millis,update_date_millis,s2id_center,speed,regular_speed,delay_seconds,seconds,length,trend,severity,jam_level,drivers_count,alerts_count,n_thumbs_up
0,12868069,1604733149024,1604735467276,3344471185277583360,13.03,17.15,299,432,1566,0,5,3,13,0,0
1,12420463,1599906813144,1599909295834,3344462996922433536,6.56,18.00,399,539,984,0,5,3,5,0,0
2,12497533,1601728355356,1601734996933,3344176694402482176,3.36,19.65,1185,1294,1212,1,5,4,21,2,0
3,12536831,1602312860279,1602315706305,3344358143885836288,4.36,8.04,467,543,659,-1,5,4,3,0,0
4,12327151,1598956623240,1598957378934,3344360723013697536,4.74,16.71,423,474,625,0,5,4,11,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352604,12362039,1599286843171,1599294437634,3344419823911174144,4.57,11.10,677,792,1006,0,5,4,12,0,0
352605,12592138,1603007434857,1603007969509,3344177896993325056,4.86,26.33,814,915,1237,0,5,4,4,1,0
352606,12823298,1604248226105,1604249587644,3344350307718004736,3.40,18.30,871,983,929,0,5,4,2,0,0
352607,12775093,1604115788906,1604117576019,3344176846873821184,9.67,15.26,676,913,2454,0,5,3,31,0,0


In [10]:
# Inspect the categorical subset returned by separate_dtype.
cat_data

Unnamed: 0,street,city,is_highway,line,s2token_center,type
0,Jatiwaringin Raya,Bekasi,t,"{""line"": [{""x"": 106.91014, ""y"": -6.258107}, {""...",2e69f2d2c,Small
1,Putri Tunggal,Depok,f,"{""line"": [{""x"": 106.887821, ""y"": -6.377016}, {...",2e69eb604,Small
2,Ir Haji Juanda,Bandung,f,"{""line"": [{""x"": 107.618629, ""y"": -6.87556}, {""...",2e68e6fc4,Large
3,KH Muchtar Tabrani,Bekasi,f,"{""line"": [{""x"": 107.002934, ""y"": -6.216088}, {...",2e698c034,Small
4,N1 Pangeran Diponegoro,Tambun Selatan,t,"{""line"": [{""x"": 107.035652, ""y"": -6.255471}, {...",2e698e5bc,Small
...,...,...,...,...,...,...
352604,N8 Jalan Raya Bogor,Bogor,t,"{""line"": [{""x"": 106.815835, ""y"": -6.556333}, {...",2e69c41c4,Small
352605,Soekarno-Hatta (Jalur Lambat),Bandung,t,"{""line"": [{""x"": 107.654988, ""y"": -6.941372}, {...",2e68e8144,Large
352606,N1 RE Martadinata,Cikarang,t,"{""line"": [{""x"": 107.155055, ""y"": -6.257546}, {...",2e6984e2c,Large
352607,Ir Haji Juanda,Bandung,f,"{""line"": [{""x"": 107.616416, ""y"": -6.879372}, {...",2e68e71fc,Small


### Check Null Values

In [11]:
# Count missing entries per numerical column.
num_data.isna().sum()

id                       0
detection_date_millis    0
update_date_millis       0
s2id_center              0
speed                    0
regular_speed            0
delay_seconds            0
seconds                  0
length                   0
trend                    0
severity                 0
jam_level                0
drivers_count            0
alerts_count             0
n_thumbs_up              0
dtype: int64

In [12]:
# Count missing entries per categorical column.
cat_data.isna().sum()

street            2836
city                 0
is_highway           0
line                 0
s2token_center       0
type                 0
dtype: int64

### Impute Categorical Data

In [13]:
def impute_categorical_transform(categorical_data, fill_value='KOSONG'):
    """Return ``categorical_data`` with every missing entry replaced.

    Parameters
    ----------
    categorical_data
        pandas object (a DataFrame in this notebook) whose missing values
        should be filled. It is not modified; ``fillna`` returns a new object.
    fill_value : default 'KOSONG'
        Placeholder written into each missing entry. The default keeps the
        sentinel this notebook has always used, so existing calls are unchanged.

    Returns
    -------
    A new object of the same kind as ``categorical_data`` with missing
    values replaced by ``fill_value``.
    """
    return categorical_data.fillna(fill_value)

In [14]:
# Fill missing categorical values; the result is bound to a new name, leaving cat_data as it was.
cat_data_imputted = impute_categorical_transform(cat_data)

In [15]:
# Confirm that no missing entries remain after imputation.
cat_data_imputted.isna().sum()

street            0
city              0
is_highway        0
line              0
s2token_center    0
type              0
dtype: int64