In [1]:
import pandas as pd
from pycaret.regression import *

# Read the cleaned house-listing CSV and, in the same expression, discard the
# "Unnamed: 0" and "name" columns. errors='ignore' makes the drop a no-op for
# any of those columns that is not present in the file.
df = pd.read_csv("../../data/rumah123_cleaned.csv").drop(
    columns=["Unnamed: 0", "name"],
    errors='ignore',
)
df

Unnamed: 0,price,location,bedroom,bathroom,land_area,building_area
0,1680.0,"Rungkut, Surabaya",3,3,67,120
1,950.0,"Medokan Ayu, Surabaya",3,3,72,140
2,500.0,"Wonorejo, Surabaya",2,1,30,36
3,950.0,"Sukolilo, Surabaya",2,2,50,55
4,600.0,"Kebraon, Surabaya",2,1,50,56
...,...,...,...,...,...,...
25930,1600.0,"Gununganyar, Surabaya",3,1,144,70
25931,1700.0,"Mulyorejo, Surabaya",5,2,120,200
25932,1300.0,"Wonorejo, Surabaya",5,3,120,180
25933,225.0,"Diponegoro, Surabaya",6,4,642,600


In [2]:
# Flag every row that is an exact repeat of an earlier row, then count the flags.
duplicate_mask = df.duplicated()
duplicate_mask.sum()

9650

In [3]:
# Keep only the first occurrence of each fully identical row; the original
# index labels of the surviving rows are preserved (no re-indexing).
df = df[~df.duplicated()]

In [4]:
# Sanity check: after de-duplication no repeated rows should remain.
remaining_duplicates = df.duplicated()
remaining_duplicates.sum()

0

In [5]:
# import ydata_profiling as pp

# profile = pp.ProfileReport(df)
# profile.to_file('Report_Rumah123.html')

In [6]:
# Initialise the PyCaret regression experiment on the de-duplicated frame.
# The call is left as the cell's last expression so the setup summary table
# is displayed.
setup(
    data=df,
    target='price',
    # Fixed seed so the split and model runs are repeatable.
    session_id=42,
    # 'location' is a text column and must be treated as categorical.
    categorical_features=['location'],
    # Feature scaling to the [0, 1] range.
    normalize=True,
    normalize_method='minmax',
    # Outlier filtering; in the summary below the transformed train set has
    # fewer rows than the original data would imply.
    remove_outliers=True,
    outliers_threshold=0.05,
    # Let PyCaret transform the target variable before fitting.
    transform_target=True,
)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,price
2,Target type,Regression
3,Original data shape,"(16285, 6)"
4,Transformed data shape,"(15715, 6)"
5,Transformed train set shape,"(10829, 6)"
6,Transformed test set shape,"(4886, 6)"
7,Numeric features,4
8,Categorical features,1
9,Preprocess,True


<pycaret.regression.oop.RegressionExperiment at 0x274784673d0>

In [7]:
# Train and cross-validate the candidate regressors available in the
# experiment and keep the top-ranked one in `best`. The leaderboard printed
# below is ordered by R2 (highest first).
best = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,217.2338,81276.9887,284.9577,0.6406,0.2683,0.2172,0.254
catboost,CatBoost Regressor,217.5036,81501.5952,285.3471,0.6394,0.2672,0.2139,1.251
rf,Random Forest Regressor,211.3045,81639.7906,285.6089,0.6389,0.2653,0.207,0.391
xgboost,Extreme Gradient Boosting,214.6613,82332.7141,286.7628,0.6357,0.2677,0.2091,0.089
et,Extra Trees Regressor,215.539,86155.9558,293.4283,0.619,0.2731,0.2112,0.319
gbr,Gradient Boosting Regressor,228.7527,88176.4494,296.7974,0.61,0.2801,0.2281,0.146
knn,K Neighbors Regressor,237.0416,98820.7273,314.1534,0.563,0.2915,0.2353,0.061
ada,AdaBoost Regressor,278.1809,113071.724,336.2154,0.5001,0.3204,0.2837,0.109
dt,Decision Tree Regressor,256.8898,133144.7545,364.8197,0.4111,0.3364,0.2441,0.061
dummy,Dummy Regressor,412.4036,226472.9359,475.8581,-0.0011,0.4586,0.458,0.07


In [8]:
# Hyperparameter search for the top-ranked model, using the Bayesian search
# algorithm from the scikit-optimize backend. The per-fold scores of the
# resulting model are printed below.
tuned_model = tune_model(
    estimator=best,
    search_algorithm='bayesian',
    search_library='scikit-optimize',
)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,204.3913,72354.9339,268.9887,0.6697,0.2444,0.1905
1,211.83,78469.3739,280.1239,0.6378,0.2602,0.2099
2,213.2902,83014.1209,288.1217,0.6444,0.28,0.2249
3,212.7503,79353.6292,281.6978,0.6542,0.259,0.2062
4,222.8607,83598.3306,289.1338,0.6356,0.2764,0.2263
5,209.4171,75736.7643,275.2031,0.6747,0.2525,0.2046
6,203.5958,71116.6978,266.6771,0.6905,0.2489,0.1989
7,214.5294,83781.0844,289.4496,0.6223,0.2753,0.2164
8,216.5194,83807.9552,289.496,0.6268,0.2865,0.2304
9,212.0852,77893.3707,279.0938,0.6539,0.2595,0.2078


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits


In [9]:
# Score the tuned model without passing new data; the metrics row printed
# below summarises the result.
# NOTE(review): per PyCaret docs this evaluates on the hold-out split created
# by setup() -- confirm for the installed version.
holdout_pred = predict_model(estimator=tuned_model)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,209.6404,77218.5533,277.8823,0.6586,0.2616,0.2081




In [10]:
# Build a feature-only frame (everything except the 'price' target) to feed
# to the model; `df` itself is left untouched.
new_data = df.drop(columns=['price'])
new_data.head()

Unnamed: 0,location,bedroom,bathroom,land_area,building_area
0,"Rungkut, Surabaya",3,3,67,120
1,"Medokan Ayu, Surabaya",3,3,72,140
2,"Wonorejo, Surabaya",2,1,30,36
3,"Sukolilo, Surabaya",2,2,50,55
4,"Kebraon, Surabaya",2,1,50,56


In [11]:
# Run the tuned model over the feature-only frame; the returned frame carries
# the inputs plus a `prediction_label` column (see the preview below).
predictions = predict_model(
    estimator=tuned_model,
    data=new_data,
)
predictions.head()



Unnamed: 0,location,bedroom,bathroom,land_area,building_area,prediction_label
0,"Rungkut, Surabaya",3,3,67,120,1263.2562
1,"Medokan Ayu, Surabaya",3,3,72,140,1205.837301
2,"Wonorejo, Surabaya",2,1,30,36,408.374002
3,"Sukolilo, Surabaya",2,2,50,55,916.593588
4,"Kebraon, Surabaya",2,1,50,56,652.589755


In [12]:
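# NOTE: the tuned model is a Light Gradient Boosting Machine (see the
# predict_model output above), so the 'rf' suffix in this file name is
# misleading; consider renaming it before enabling this line.
# save_model(tuned_model, 'house-price-rf')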