In [18]:
# Import library yang diperlukan
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import shap
from sklearn.inspection import PartialDependenceDisplay
# Memuat dataset
file_path = 'melb_data.csv'
data = pd.read_csv(file_path)

# Menampilkan beberapa baris awal
print(data.head())



       Suburb           Address  Rooms Type      Price Method SellerG  \
0  Abbotsford      85 Turner St      2    h  1480000.0      S  Biggin   
1  Abbotsford   25 Bloomburg St      2    h  1035000.0      S  Biggin   
2  Abbotsford      5 Charles St      3    h  1465000.0     SP  Biggin   
3  Abbotsford  40 Federation La      3    h   850000.0     PI  Biggin   
4  Abbotsford       55a Park St      4    h  1600000.0     VB  Nelson   

        Date  Distance  Postcode  ...  Bathroom  Car  Landsize  BuildingArea  \
0  3/12/2016       2.5    3067.0  ...       1.0  1.0     202.0           NaN   
1  4/02/2016       2.5    3067.0  ...       1.0  0.0     156.0          79.0   
2  4/03/2017       2.5    3067.0  ...       2.0  0.0     134.0         150.0   
3  4/03/2017       2.5    3067.0  ...       2.0  1.0      94.0           NaN   
4  4/06/2016       2.5    3067.0  ...       1.0  2.0     120.0         142.0   

   YearBuilt  CouncilArea Lattitude  Longtitude             Regionname  \
0     

In [19]:
# Menghapus baris dengan nilai kosong
data = data.dropna()

# Memisahkan fitur dan target
X = data.drop(columns=['Price'])  # Asumsikan kolom 'Price' adalah target
y = data['Price']

# Membagi data menjadi train-test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [20]:
# Menentukan kolom bertipe object
categorical_columns = X.select_dtypes(include=['object']).columns

# Konversi kolom kategori menjadi kategori pandas
for col in categorical_columns:
    X[col] = X[col].astype('category')

# Membagi data menjadi train-test setelah konversi
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [21]:
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)
print(X_train.dtypes)


Suburb           category
Address          category
Rooms               int64
Type             category
Method           category
SellerG          category
Date             category
Distance          float64
Postcode          float64
Bedroom2          float64
Bathroom          float64
Car               float64
Landsize          float64
BuildingArea      float64
YearBuilt         float64
CouncilArea      category
Lattitude         float64
Longtitude        float64
Regionname       category
Propertycount     float64
dtype: object


In [22]:
for col in categorical_columns:
    X_train[col] = X_train[col].astype('category')
    X_test[col] = X_test[col].astype('category')


In [23]:
# One-Hot Encoding untuk kolom kategori
X_encoded = pd.get_dummies(X, columns=['Suburb', 'Address', 'Type', 'Method', 
                                       'SellerG', 'Date', 'CouncilArea', 'Regionname'])

# Membagi data yang sudah di-encode
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Membuat DMatrix tanpa enable_categorical
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)


In [24]:
# Memuat model yang telah dilatih

from xgboost import XGBRegressor, plot_importance
import matplotlib.pyplot as plt

model = XGBRegressor()
model.load_model('model_path.json')  # Ganti dengan path model Anda

# Menampilkan feature importance
plot_importance(model)
plt.title('Feature Importance')
plt.show()



XGBoostError: [14:05:23] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0ed59c031377d09b8-1\xgboost\xgboost-ci-windows\src\common\io.cc:147: Opening model_path.json failed: The system cannot find the file specified.

In [None]:
# Menampilkan pohon pertama
xgb.plot_tree(model, num_trees=0)
plt.rcParams['figure.figsize'] = [50, 10]
plt.show()


NameError: name 'model' is not defined

In [None]:
# Menyiapkan evaluasi selama pelatihan
evals_result = {}
model = xgb.train(
    params, dtrain, num_boost_round=100, evals=[(dtrain, 'train'), (dtest, 'test')],
    evals_result=evals_result, verbose_eval=False
)

# Plot learning curve
epochs = len(evals_result['train']['rmse'])
x_axis = range(0, epochs)
plt.plot(x_axis, evals_result['train']['rmse'], label='Train')
plt.plot(x_axis, evals_result['test']['rmse'], label='Test')
plt.legend()
plt.xlabel('Boosting Rounds')
plt.ylabel('RMSE')
plt.title('Learning Curve')
plt.show()


In [None]:
# Menggunakan SHAP untuk analisis
explainer = shap.Explainer(model, X_test)
shap_values = explainer(X_test)

# Visualisasi SHAP Summary
shap.summary_plot(shap_values, X_test)


In [None]:
# Menampilkan PDP untuk fitur pertama
feature_name = X.columns[0]
PartialDependenceDisplay.from_estimator(model, X_test, [feature_name])
plt.title(f"Partial Dependence Plot of {feature_name}")
plt.show()
