In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce

# model selection & hyperparameter tuning
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold

# model regression
from sklearn.ensemble import RandomForestRegressor
import catboost
from catboost import CatBoostRegressor

# metric regression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# hyperparameter tuning
import hyperopt
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

import time
import warnings
warnings.filterwarnings('ignore')


**Get Stored Var**

In [2]:
%store -r df_clean
df_clean.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,brand
0,A1,2017,12500,Manual,15735,Petrol,150.0,55.4,1.4,audi
1,A6,2016,16500,Automatic,36203,Diesel,20.0,64.2,2.0,audi
2,A1,2016,11000,Manual,29946,Petrol,30.0,55.4,1.4,audi
3,A4,2017,16800,Automatic,25952,Diesel,145.0,67.3,2.0,audi
4,A3,2019,17300,Manual,1998,Petrol,145.0,49.6,1.0,audi


In [3]:
%store -r transformer
transformer

ColumnTransformer(remainder='passthrough',
                  transformers=[('catboost encoder', CatBoostEncoder(),
                                 ['model', 'transmission', 'fuelType',
                                  'brand'])])

# Feature Engineering

Pada tahap EDA, kita sudah menganalisa dan mendapatkan beberapa tambahan informasi yang bisa kita jadikan feature baru, mereka adalah :
1. Audi, BMW dan Merc termasuk brand mobil mewah dan memiliki rentang harga mobil yang lebih tinggi dibanding brand lainnya
2. Terdapat 2 mobil antik dengan harga yang cukup tinggi 


In [4]:
df_clean['lux_brand'] = np.where((df_clean['brand'] == 'audi') | (df_clean['brand'] == 'bmw') | (df_clean['brand'] == 'merc'), 1, 0)
df_clean['antique'] = np.where(df_clean['year'] == 1970, 1, 0)
df_clean.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,brand,lux_brand,antique
0,A1,2017,12500,Manual,15735,Petrol,150.0,55.4,1.4,audi,1,0
1,A6,2016,16500,Automatic,36203,Diesel,20.0,64.2,2.0,audi,1,0
2,A1,2016,11000,Manual,29946,Petrol,30.0,55.4,1.4,audi,1,0
3,A4,2017,16800,Automatic,25952,Diesel,145.0,67.3,2.0,audi,1,0
4,A3,2019,17300,Manual,1998,Petrol,145.0,49.6,1.0,audi,1,0


# Splitting Data

In [5]:
x = df_clean.drop(columns=['price'])
y = df_clean['price']

In [6]:
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=2020
)

In [7]:
# data train dibagi lagi menjadi data train dan data validation
x2_train, x_val, y2_train, y_val = train_test_split(
    x_train,
    y_train,
    random_state=2020
)

# Hyperparameter Tuning

In [8]:
def getEstimatorResult(estimator, iterasi):
    global cur_best_mae, cur_best_mape
    y_pred = estimator.predict(x_val)
    test_mae = mean_absolute_error(y_val, y_pred)

    if test_mae < cur_best_mae:
        cur_best_mae = test_mae
        cur_best_mape = mean_absolute_percentage_error(y_val, y_pred)*100
        print(f'iterasi : {iterasi}, cur_best_mae: {cur_best_mae}')

    return{'loss':cur_best_mae, 'status': STATUS_OK }

def objRandomForest(space):
    global obj_call_count
    obj_call_count += 1
    
    model = RandomForestRegressor(
        max_depth=space['max_depth'], 
        min_samples_split= int(space['min_samples_split']),
        min_samples_leaf= int(space['min_samples_leaf']),
        random_state=2020
    )
    
    estimator = Pipeline([
                    ('preprocess', transformer),
                    ('model', model)
                ])
    estimator.fit(x2_train, y2_train)
    
    return getEstimatorResult(estimator, obj_call_count)

def objCatBoost(space):
    global obj_call_count
    obj_call_count += 1
    
    model = CatBoostRegressor(
        learning_rate=space['learning_rate'], 
        depth=int(space['depth']), 
        silent=True, 
        random_state=2020
    )
    
    estimator = Pipeline([
                    ('preprocess', transformer),
                    ('model', model)
                ])
    estimator.fit(x2_train, y2_train)
    
    return getEstimatorResult(estimator, obj_call_count)

def objCatBoostCategory(space):
    global obj_call_count
    obj_call_count += 1
    
    estimator = CatBoostRegressor(
        learning_rate=space['learning_rate'], 
        depth=int(space['depth']), 
        cat_features=['model', 'transmission', 'fuelType', 'brand'],
        silent=True, 
        random_state=2020
    )
    estimator.fit(x2_train, y2_train)
    
    return getEstimatorResult(estimator, obj_call_count)
    

In [9]:
models = [{
    'name' : 'Random Forest',
    'objective': objRandomForest,
    'space' : { 
        'max_depth': hp.quniform('max_depth', 5, 9, 1), # nilai discret / nilai yang berurutan, dari 5 sampai 9, stepnya 1 >> 5,6,7,8,9
        'min_samples_split': hp.quniform('min_samples_split', 2, 6, 1),
        'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 5, 1)
    }
}, {
    'name' : 'CatBoost',
    'objective': objCatBoost,
    'space' : { 
        'depth': hp.quniform('depth', 5, 9, 1), # nilai discret / nilai yang berurutan, dari 5 sampai 9, stepnya 1 >> 5,6,7,8,9
        'learning_rate': hp.loguniform('learning_rate',  -5.0, -2) # hasil nilai exponen dari np.exp(-5) atau 2.71828**-5 = 0.006 sampai np.exp(-2) atau  2.71828**-2= 0.135
    }
}, {
    'name' : 'CatBoost Category',
    'objective': objCatBoostCategory,
    'space' : { 
        'depth': hp.quniform('depth', 5, 9, 1), # nilai discret / nilai yang berurutan, dari 5 sampai 9, stepnya 1 >> 5,6,7,8,9
        'learning_rate': hp.loguniform('learning_rate',  -5.0, -2) # hasil nilai exponen dari np.exp(-5) atau 2.71828**-5 = 0.006 sampai np.exp(-2) atau  2.71828**-2= 0.135
    }
}]

model_name = []
model_param = []
model_mae_score = []
model_mape_score = []
exec_time = []

N_HYPEROPT_PROBES = 100 # jumlah iterasi
HYPEROPT_ALGO = tpe.suggest # parameter standard untuk random search

for i in models:
    start_time = time.time()
    cur_best_mae = np.inf # untuk simpan nilai MAE terkecil/terbaik
    cur_best_mape = np.inf # untuk simpan nilai MAE terkecil/terbaik
    obj_call_count = 0
    
    print(i['name'])
    
    trials = Trials()
    best = hyperopt.fmin(
        fn=i['objective'], # This function will be called with a value generated from `space`
        space=i['space'], # set of possible arguments to `fn`
        algo=HYPEROPT_ALGO, # search algorithm
        max_evals=N_HYPEROPT_PROBES,
        trials=trials, # Allow up to this many function evaluations before returning
        rstate= np.random.RandomState(42), # random state, agar hasil best parameter tidak berubah setiap kali dirun
        verbose=True
    )
    
    model_name.append(i['name'])
    model_param.append(best)
    model_mae_score.append(cur_best_mae)
    model_mape_score.append(cur_best_mape)
    exec_time.append(time.time() - start_time)
    
    print('='*100)
    

Random Forest
iterasi : 1, cur_best_mae: 2333.6811158980076          
iterasi : 2, cur_best_mae: 1593.5469664639254                                    
iterasi : 13, cur_best_mae: 1591.3950412511085                                    
iterasi : 21, cur_best_mae: 1590.4838457539856                                    
100%|██████████| 100/100 [25:22<00:00, 15.22s/trial, best loss: 1590.4838457539856]
CatBoost
iterasi : 1, cur_best_mae: 1381.4611140491331          
iterasi : 3, cur_best_mae: 1263.6392523184331                                    
iterasi : 10, cur_best_mae: 1261.5839245685559                                   
iterasi : 66, cur_best_mae: 1257.9197108161716                                    
iterasi : 71, cur_best_mae: 1249.6949267880575                                    
iterasi : 72, cur_best_mae: 1238.2905097208195                                    
iterasi : 86, cur_best_mae: 1237.6786611056805                                    
100%|██████████| 100/100 [17:53<00:00

In [10]:
modelComparison=pd.DataFrame({
    'model': model_name,
    'best_param': model_param,
    'mae_score': model_mae_score,
    'mape_score': model_mape_score,
    'exec_time': exec_time
})
modelComparison

Unnamed: 0,model,best_param,mae_score,mape_score,exec_time
0,Random Forest,"{'max_depth': 9.0, 'min_samples_leaf': 3.0, 'min_samples_split': 3.0}",1590.483846,10.074372,1522.083928
1,CatBoost,"{'depth': 9.0, 'learning_rate': 0.0941932282877387}",1237.678661,7.690776,1073.260708
2,CatBoost Category,"{'depth': 9.0, 'learning_rate': 0.12010440689306091}",1148.555457,7.166322,3695.732922


Dari hasil feature engineering dan optimisasi parameter dengan hyperopt kita mendapatkan, penurunan performa pada Random Forest, dan peningkatan performa pada catboost & catboost category. Dari hasil diatas kita dapat memutuskan model terbaik yang akan kita gunakan adalah Catboost Category dengan Skor MAE 1148.5 atau persentase kesalahan prediksi sebesar 7.16% 