# Charaland Price Prediction - Web Version


In [1]:
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
# Silence every warning category for the rest of the session so cell output stays uncluttered.
# NOTE(review): this also hides deprecation warnings from the libraries used below —
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

## Load Dataset

In [2]:
# Read the house-price listings CSV (path is relative to the notebook's working directory).
csv_path = "jabodetabek_house_price.csv"
df = pd.read_csv(csv_path)

# Report the raw dimensions, then preview the first five rows.
print("Dataset shape:", df.shape)
df.head(5)

Dataset shape: (3553, 27)


Unnamed: 0,url,price_in_rp,title,address,district,city,lat,long,facilities,property_type,...,electricity,maid_bedrooms,maid_bathrooms,floors,building_age,year_built,property_condition,building_orientation,garages,furnishing
0,https://www.rumah123.com/properti/bekasi/hos11...,2990000000.0,Rumah cantik Sumarecon Bekasi\nLingkungan asri...,"Summarecon Bekasi, Bekasi",Summarecon Bekasi,Bekasi,-6.223945,106.986275,"Tempat Jemuran, Jalur Telepon, Taman, Taman",rumah,...,4400 mah,0.0,1.0,2.0,5.0,2017.0,bagus,,0.0,unfurnished
1,https://www.rumah123.com/properti/bekasi/hos10...,1270000000.0,"Rumah Kekinian, Magenta Summarecon Bekasi","Summarecon Bekasi, Bekasi",Summarecon Bekasi,Bekasi,-6.223945,106.986275,Taman,rumah,...,2200 mah,0.0,0.0,2.0,,,bagus,,0.0,
2,https://www.rumah123.com/properti/bekasi/hos10...,1950000000.0,Rumah Cantik 2 Lantai Cluster Bluebell Summare...,"Summarecon Bekasi, Bekasi",Summarecon Bekasi,Bekasi,-6.223945,106.986275,"Jogging Track, Kolam Renang, Masjid, Taman,...",rumah,...,2200 mah,1.0,1.0,2.0,,,bagus,,1.0,unfurnished
3,https://www.rumah123.com/properti/bekasi/hos10...,3300000000.0,Rumah Mewah 2Lantai L10x18 C di Cluster VERNON...,"Summarecon Bekasi, Bekasi",Summarecon Bekasi,Bekasi,-6.223945,106.986275,"Jalur Telepon, Jogging Track, Track Lari, K...",rumah,...,3500 mah,1.0,1.0,2.0,6.0,2016.0,bagus sekali,utara,2.0,unfurnished
4,https://www.rumah123.com/properti/bekasi/hos10...,4500000000.0,"Rumah Hoek di Cluster Maple Summarecon Bekasi,...","Summarecon Bekasi, Bekasi",Summarecon Bekasi,Bekasi,-6.223945,106.986275,"Jogging Track, Kolam Renang, Taman, Jalur Te...",rumah,...,3500 mah,1.0,1.0,2.0,9.0,2013.0,bagus,utara,1.0,unfurnished


## Basic Data Check

In [3]:
# Print column names, non-null counts and dtypes to spot columns with missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3553 entries, 0 to 3552
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   url                   3553 non-null   object 
 1   price_in_rp           3553 non-null   float64
 2   title                 3553 non-null   object 
 3   address               3553 non-null   object 
 4   district              3553 non-null   object 
 5   city                  3553 non-null   object 
 6   lat                   3553 non-null   float64
 7   long                  3553 non-null   float64
 8   facilities            3553 non-null   object 
 9   property_type         3552 non-null   object 
 10  ads_id                3549 non-null   object 
 11  bedrooms              3519 non-null   float64
 12  bathrooms             3524 non-null   float64
 13  land_size_m2          3551 non-null   float64
 14  building_size_m2      3551 non-null   float64
 15  carports             

In [4]:
# Count missing values per column and list the most affected columns first.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

building_orientation    1647
year_built              1445
building_age            1445
furnishing               387
property_condition       246
certificate              141
bedrooms                  34
bathrooms                 29
floors                     6
ads_id                     4
building_size_m2           2
land_size_m2               2
property_type              1
price_in_rp                0
facilities                 0
carports                   0
long                       0
electricity                0
maid_bedrooms              0
maid_bathrooms             0
lat                        0
city                       0
district                   0
address                    0
title                      0
garages                    0
url                        0
dtype: int64

## Select Features & Target

In [5]:
# Column the model is trained to predict.
TARGET = "price_in_rp"

# Predictors that are used as numbers without further encoding.
NUM_FEATURES = [
    "building_size_m2", "land_size_m2",
    "bedrooms", "bathrooms",
    "carports", "garages", "floors",
    "lat", "long",
]

# Predictors that are treated as categories.
CAT_FEATURES = [
    "city", "district",
    "certificate", "furnishing",
    "property_condition", "property_type",
    "electricity",
]

# Full predictor list: numeric columns first, then categorical ones.
FEATURES = [*NUM_FEATURES, *CAT_FEATURES]

# Work on an independent copy limited to the predictors plus the target,
# so later cleaning steps do not touch the raw frame `df`.
df_model = df.loc[:, [*FEATURES, TARGET]].copy()
df_model.head()

Unnamed: 0,building_size_m2,land_size_m2,bedrooms,bathrooms,carports,garages,floors,lat,long,city,district,certificate,furnishing,property_condition,property_type,electricity,price_in_rp
0,272.0,239.0,4.0,4.0,0.0,0.0,2.0,-6.223945,106.986275,Bekasi,Summarecon Bekasi,shm - sertifikat hak milik,unfurnished,bagus,rumah,4400 mah,2990000000.0
1,69.0,55.0,3.0,2.0,1.0,0.0,2.0,-6.223945,106.986275,Bekasi,Summarecon Bekasi,hgb - hak guna bangunan,,bagus,rumah,2200 mah,1270000000.0
2,131.0,119.0,3.0,3.0,1.0,1.0,2.0,-6.223945,106.986275,Bekasi,Summarecon Bekasi,hgb - hak guna bangunan,unfurnished,bagus,rumah,2200 mah,1950000000.0
3,174.0,180.0,3.0,3.0,0.0,2.0,2.0,-6.223945,106.986275,Bekasi,Summarecon Bekasi,shm - sertifikat hak milik,unfurnished,bagus sekali,rumah,3500 mah,3300000000.0
4,196.0,328.0,4.0,3.0,2.0,1.0,2.0,-6.223945,106.986275,Bekasi,Summarecon Bekasi,shm - sertifikat hak milik,unfurnished,bagus,rumah,3500 mah,4500000000.0


## Handle Missing Values

In [6]:
# Numeric gaps: replace NaN with each column's median.
# Note: the medians are taken over every row of df_model, i.e. before the
# train/test split performed in a later cell.
numeric_medians = df_model[NUM_FEATURES].median()
df_model[NUM_FEATURES] = df_model[NUM_FEATURES].fillna(numeric_medians)

# Categorical gaps: use an explicit "Unknown" level instead of NaN.
df_model[CAT_FEATURES] = df_model[CAT_FEATURES].fillna("Unknown")

# Sanity check: every count below should now be zero.
df_model.isna().sum()

building_size_m2      0
land_size_m2          0
bedrooms              0
bathrooms             0
carports              0
garages               0
floors                0
lat                   0
long                  0
city                  0
district              0
certificate           0
furnishing            0
property_condition    0
property_type         0
electricity           0
price_in_rp           0
dtype: int64

## Remove Extreme Outliers

In [7]:
# 1st and 99th percentile of the target bound the price range that is kept.
q_low, q_high = df_model[TARGET].quantile([0.01, 0.99])

# Keep rows whose price lies inside [q_low, q_high] (both ends inclusive).
within_range = df_model[TARGET].between(q_low, q_high)
df_model = df_model.loc[within_range]

print("Shape after outlier removal:", df_model.shape)

Shape after outlier removal: (3482, 17)


## Split Features & Log Transform Target

In [8]:
# Predictors, and the target on a log1p scale (inverted with expm1 at evaluation time).
X = df_model.loc[:, FEATURES]
price = df_model[TARGET]
y = np.log1p(price)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Preprocessing Pipeline

In [9]:
# Missing-value handling lives inside the pipeline so the saved artifact is
# self-contained: whoever loads it can pass rows containing NaN without
# re-implementing the notebook's fill logic. On training data that has already
# been filled these imputers change nothing, so the fitted model is unaffected.

# Numeric columns: replace NaN with the median learned from the training rows.
numeric_transformer = SimpleImputer(strategy="median")

# Categorical columns: map NaN to the explicit "Unknown" level, then one-hot
# encode. Levels unseen during fit are encoded as all zeros instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Route each column group to its transformer; columns not listed are dropped
# (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, NUM_FEATURES),
        ("cat", categorical_transformer, CAT_FEATURES),
    ]
)

## Build Model Pipeline (XGBoost)

In [10]:
# Hyperparameters for the gradient-boosted regressor, gathered in one place.
xgb_params = {
    "n_estimators": 600,
    "max_depth": 8,
    "learning_rate": 0.03,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.5,
    "reg_lambda": 1.5,
    "random_state": 42,
    "n_jobs": -1,
}
model = XGBRegressor(**xgb_params)

# Chain preprocessing and the regressor so both are fitted and saved as one object.
pipeline = Pipeline([("preprocessor", preprocessor), ("model", model)])

## Train Model

In [11]:
# Fit the preprocessing steps and the regressor on the training split
# (the target is on the log1p scale).
pipeline.fit(X=X_train, y=y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


## Evaluation

In [12]:
# Predict on the held-out rows; the model outputs log1p(price).
y_pred_log = pipeline.predict(X_test)

# Undo the log1p transform so the metrics are expressed in rupiah.
y_pred, y_true = (np.expm1(values) for values in (y_pred_log, y_test))

r2, mae = r2_score(y_true, y_pred), mean_absolute_error(y_true, y_pred)

print(f"R2 Score : {r2:.4f}")
print(f"MAE      : Rp {mae:,.0f}")

R2 Score : 0.8574
MAE      : Rp 658,713,555


## Save Pipeline

In [13]:
# Single source of truth for the artifact location, so the file that is written
# and the path that is reported cannot drift apart.
MODEL_PATH = "pipeline_house_price.joblib"

# Persist the whole fitted pipeline (preprocessing + regressor) in one file.
# joblib files are pickle-based: only load them from a trusted source.
joblib.dump(pipeline, MODEL_PATH)
print(f"Model saved as {MODEL_PATH}")

Model saved as pipeline_house_price.joblib
