In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the Boston Housing dataset from the remote CSV.
url = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
df = pd.read_csv(url)

# The target is "medv" (median value of owner-occupied homes); every other
# column is used as a feature.
y = df["medv"]
X = df.drop("medv", axis=1)

# Hold out 20% of the rows as a test set; the seed is fixed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline: linear regression on the raw (continuous) features.
model_continuous = LinearRegression().fit(X_train, y_train)
y_pred_continuous = model_continuous.predict(X_test)
mse_continuous = mean_squared_error(y_test, y_pred_continuous)

# Discretize every feature into 5 equal-width bins encoded as ordinal integers.
# Bin edges are learned from the training split only and then reused on the
# test split.
kbin_discretizer = KBinsDiscretizer(
    n_bins=5,
    encode="ordinal",
    strategy="uniform",
)
X_train_binned = kbin_discretizer.fit_transform(X_train)
X_test_binned = kbin_discretizer.transform(X_test)

# Same model family, trained on the binned features.
model_binned = LinearRegression().fit(X_train_binned, y_train)
y_pred_binned = model_binned.predict(X_test_binned)
mse_binned = mean_squared_error(y_test, y_pred_binned)

# Report the test-set MSE of both models.
print(f"MSE tanpa discretization: {mse_continuous:.4f}")
print(f"MSE dengan discretization: {mse_binned:.4f}")

MSE tanpa discretization: 24.2911
MSE dengan discretization: 27.3000


Hasil Evaluasi (MSE - Mean Squared Error)
- Model tanpa discretization (kontinu): 24.29
- Model dengan discretization: 27.30

Perbandingan Hasil

Discretization menurunkan performa model
- MSE lebih tinggi setelah discretization, yang berarti model kurang akurat dalam memprediksi.
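- Regresi linear bekerja lebih optimal dengan fitur numerik kontinu karena dapat menangkap hubungan linear lebih baik dibandingkan dengan data yang telah dikategorikan; discretization menjadi 5 bin ordinal menghilangkan variasi nilai di dalam setiap bin, sehingga sebagian informasi hilang.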