In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
import joblib


In [2]:
# Ganti path di bawah sesuai nama folder unggahan kamu
file_path = "/kaggle/input/datarumah-jakarta/data_rumah_jakarta.xlsx"
data = pd.read_excel(file_path)

# Cek isi data
print(data.head())

   price_in_rp  district            city  bedrooms  bathrooms  land_size_m2  \
0   6800000000  Jelambar   Jakarta Barat         4          3           260   
1   1650000000  Jelambar   Jakarta Barat         3          3            60   
2   2400000000  Jelambar   Jakarta Barat         5          4            89   
3   3550000000  Jelambar   Jakarta Barat         4          2           112   
4   2400000000  Jelambar   Jakarta Barat         5          2            84   

   building_size_m2  carports  electricity  maid_bedrooms  maid_bathrooms  \
0               387         0     5500 mah              1               1   
1               132         0     2200 mah              0               0   
2               227         0  lainnya mah              0               0   
3               160         0  lainnya mah              0               0   
4               144         0  lainnya mah              0               0   

   floors property_condition  garages  
0       2             

In [3]:
# Pastikan kolom target ('price_in_rp') dan kolom kategorikal disebutkan
categorical_features = ['district', 'city', 'property_condition', 'electricity']

# Konversi kolom kategorikal menjadi tipe kategori
for col in categorical_features:
    data[col] = data[col].astype('category')

# One-hot encoding pada fitur kategorikal
data_encoded = pd.get_dummies(data, columns=categorical_features, drop_first=True)

In [4]:
# Misal target variabel adalah 'harga'
X = data_encoded.drop(columns='price_in_rp')
y = data_encoded['price_in_rp']

# Split data 70% train dan 30% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [5]:
dt = DecisionTreeRegressor(random_state=42)
params_dt = {'max_depth': [5, 10, 15, 20]}
grid_dt = GridSearchCV(dt, params_dt, cv=5, scoring='r2')
grid_dt.fit(X_train, y_train)
best_dt = grid_dt.best_estimator_
joblib.dump(best_dt, 'model_decision_tree.pkl')


['model_decision_tree.pkl']

In [6]:
rf = RandomForestRegressor(random_state=42)
params_rf = {'n_estimators': [100, 200], 'max_depth': [10, 15, 20]}
grid_rf = GridSearchCV(rf, params_rf, cv=5, scoring='r2')
grid_rf.fit(X_train, y_train)
best_rf = grid_rf.best_estimator_
joblib.dump(best_rf, 'model_random_forest.pkl')


['model_random_forest.pkl']

In [7]:
xgb = XGBRegressor(random_state=42, objective='reg:squarederror')
params_xgb = {'n_estimators': [100, 200], 'max_depth': [3, 6], 'learning_rate': [0.05, 0.1]}
grid_xgb = GridSearchCV(xgb, params_xgb, cv=5, scoring='r2')
grid_xgb.fit(X_train, y_train)
best_xgb = grid_xgb.best_estimator_
joblib.dump(best_xgb, 'model_xgboost.pkl')


['model_xgboost.pkl']

In [8]:
pred_dt = best_dt.predict(X_test)
pred_rf = best_rf.predict(X_test)
pred_xgb = best_xgb.predict(X_test)

# Hitung bobot optimal berdasarkan RMSE invers
def get_weight(rmse_list):
    inverse_rmse = 1 / np.array(rmse_list)
    return inverse_rmse / inverse_rmse.sum()

rmse_dt = np.sqrt(mean_squared_error(y_test, pred_dt))
rmse_rf = np.sqrt(mean_squared_error(y_test, pred_rf))
rmse_xgb = np.sqrt(mean_squared_error(y_test, pred_xgb))

weights = get_weight([rmse_dt, rmse_rf, rmse_xgb])
print("Bobot Optimal:", weights)

pred_ensemble = (weights[0]*pred_dt + weights[1]*pred_rf + weights[2]*pred_xgb)


Bobot Optimal: [0.20267247 0.36978243 0.4275451 ]


In [9]:
def evaluate_model(model, X, y):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    r2 = cross_val_score(model, X, y, cv=kf, scoring='r2').mean()
    mse = -cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error').mean()
    rmse = np.sqrt(mse)
    mape = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_absolute_percentage_error').mean() * -1
    return r2, rmse, mse, mape

models = {
    'Decision Tree': best_dt,
    'Random Forest': best_rf,
    'XGBoost': best_xgb
}

for name, model in models.items():
    r2, rmse, mse, mape = evaluate_model(model, X, y)
    print(f"{name} -> R2: {r2:.4f}, RMSE: {rmse:.4f}, MSE: {mse:.4f}, MAPE: {mape:.4f}")

# Evaluasi model ensemble manual
def evaluate_ensemble(preds, y_true):
    r2 = r2_score(y_true, preds)
    rmse = np.sqrt(mean_squared_error(y_true, preds))
    mse = mean_squared_error(y_true, preds)
    mape = mean_absolute_percentage_error(y_true, preds)
    return r2, rmse, mse, mape

r2_e, rmse_e, mse_e, mape_e = evaluate_ensemble(pred_ensemble, y_test)
print(f"Weighted Model Averaging (Inverse Method) -> R2: {r2_e:.4f}, RMSE: {rmse_e:.4f}, MSE: {mse_e:.4f}, MAPE: {mape_e:.4f}")


Decision Tree -> R2: 0.3613, RMSE: 15379327821.8082, MSE: 236523724250644578304.0000, MAPE: 0.2841
Random Forest -> R2: 0.6586, RMSE: 11300840209.3216, MSE: 127708989436619964416.0000, MAPE: 0.2345
XGBoost -> R2: 0.7039, RMSE: 10101132714.1351, MSE: 102032882108571041792.0000, MAPE: 0.3005
Weighted Model Averaging (Inverse Method) -> R2: 0.8301, RMSE: 6798269306.8421, MSE: 46216465568350945280.0000, MAPE: 0.2316


In [10]:
from itertools import product

# Prediksi masing-masing model
pred_dt = best_dt.predict(X_test)
pred_rf = best_rf.predict(X_test)
pred_xgb = best_xgb.predict(X_test)
all_preds = np.vstack([pred_dt, pred_rf, pred_xgb])

# Buat grid bobot (misalnya interval 0.05)
grid_range = np.arange(0, 1.05, 0.05)
best_rmse = float('inf')
best_weights = None

for w1, w2 in product(grid_range, repeat=2):
    w3 = 1 - w1 - w2
    if 0 <= w3 <= 1:
        weights = [w1, w2, w3]
        ensemble_pred = np.dot(weights, all_preds)
        rmse = np.sqrt(mean_squared_error(y_test, ensemble_pred))
        if rmse < best_rmse:
            best_rmse = rmse
            best_weights = weights

print("Bobot optimal (grid search):", best_weights)
print("RMSE Ensemble (grid):", best_rmse)

# Final prediction
pred_ensemble_grid = np.dot(best_weights, all_preds)

# Evaluasi akhir
r2_e, rmse_e, mse_e, mape_e = evaluate_ensemble(pred_ensemble_grid, y_test)
print(f"Weighted Model Averaging (Grid Search) -> R2: {r2_e:.4f}, RMSE: {rmse_e:.4f}, MSE: {mse_e:.4f}, MAPE: {mape_e:.4f}")


Bobot optimal (grid search): [0.0, 0.1, 0.9]
RMSE Ensemble (grid): 5842180959.981659
Weighted Model Averaging (Grid Search) -> R2: 0.8745, RMSE: 5842180959.9817, MSE: 34131078369172213760.0000, MAPE: 0.2447


In [11]:
for name, model in models.items():
    r2, rmse, mse, mape = evaluate_model(model, X, y)
    print(f"{name} -> R2: {r2:.4f}, RMSE: {rmse:.4f}, MSE: {mse:.4f}, MAPE: {mape:.4f}")



print(f"Weighted Model Averaging (Grid Search) -> R2: {r2_e:.4f}, RMSE: {rmse_e:.4f}, MSE: {mse_e:.4f}, MAPE: {mape_e:.4f}")



Decision Tree -> R2: 0.3613, RMSE: 15379327821.8082, MSE: 236523724250644578304.0000, MAPE: 0.2841
Random Forest -> R2: 0.6586, RMSE: 11300840209.3216, MSE: 127708989436619964416.0000, MAPE: 0.2345
XGBoost -> R2: 0.7039, RMSE: 10101132714.1351, MSE: 102032882108571041792.0000, MAPE: 0.3005
Weighted Model Averaging (Grid Search) -> R2: 0.8745, RMSE: 5842180959.9817, MSE: 34131078369172213760.0000, MAPE: 0.2447
