<a href="https://colab.research.google.com/github/raflinoob132/MachineLearning-1/blob/main/Week%202/Regression%20Boston%20Housing/Collab%20Notebook/Discretization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder

# Fetch the Boston Housing CSV into a DataFrame.
url = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
df = pd.read_csv(url)

# 'medv' is the regression target (y); every other column is a feature (X).
y = df['medv']
X = df.drop(columns=['medv'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Discretize selected continuous features with KBinsDiscretizer, then one-hot
# encode the resulting bin indices and append them to the untouched features.
n_bins = 5

# Features to discretize (e.g. 'rm', 'lstat', 'age').
features_to_discretize = ['rm', 'lstat', 'age']

# strategy='uniform' builds equal-width bins; encode='ordinal' returns the bin
# index (0 .. n_bins - 1) of each value.
discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
X_train_discrete = X_train.copy()
X_test_discrete = X_test.copy()

# Fit once on all selected columns: bin edges are computed per column, so the
# result matches a per-feature fit, but the fitted discretizer now keeps the
# edges of every feature instead of only the last one fitted in a loop.
# Edges are learned from the training split only and reused on the test split.
X_train_discrete[features_to_discretize] = discretizer.fit_transform(
    X_train[features_to_discretize]
)
X_test_discrete[features_to_discretize] = discretizer.transform(
    X_test[features_to_discretize]
)

# One-hot encode the bin indices. The categories are pinned to the full bin
# range rather than inferred from the training split: an equal-width bin that
# happens to be empty in training but occupied in the test split would
# otherwise make encoder.transform raise on an unknown category.
encoder = OneHotEncoder(
    categories=[np.arange(n_bins, dtype=float)] * len(features_to_discretize),
    sparse_output=False,
    drop='first',  # drop one indicator per feature to avoid multicollinearity
)
X_train_encoded = encoder.fit_transform(X_train_discrete[features_to_discretize])
X_test_encoded = encoder.transform(X_test_discrete[features_to_discretize])

# Combine the remaining (non-discretized) features with the one-hot columns.
X_train_final = np.hstack((X_train_discrete.drop(features_to_discretize, axis=1).values, X_train_encoded))
X_test_final = np.hstack((X_test_discrete.drop(features_to_discretize, axis=1).values, X_test_encoded))

def _fit_and_evaluate(train_features, test_features, train_target, test_target):
    """Fit a LinearRegression and return (model, test predictions, MSE, R²)."""
    fitted = LinearRegression()
    fitted.fit(train_features, train_target)
    predicted = fitted.predict(test_features)
    return (
        fitted,
        predicted,
        mean_squared_error(test_target, predicted),
        r2_score(test_target, predicted),
    )


# Baseline: linear regression on the original continuous features.
model_continuous, y_pred_continuous, mse_continuous, r2_continuous = _fit_and_evaluate(
    X_train, X_test, y_train, y_test
)

# Same model trained on the discretized, one-hot encoded feature matrix.
model_discrete, y_pred_discrete, mse_discrete, r2_discrete = _fit_and_evaluate(
    X_train_final, X_test_final, y_train, y_test
)

# Compare the two models: test-set MSE and R² for each, separated by a blank line.
report_lines = [
    "Model dengan Fitur Kontinu:",
    f"  MSE: {mse_continuous:.4f}",
    f"  R²: {r2_continuous:.4f}",
    "",
    "Model dengan Fitur Discretisasi:",
    f"  MSE: {mse_discrete:.4f}",
    f"  R²: {r2_discrete:.4f}",
]
print("\n".join(report_lines))

Model dengan Fitur Kontinu:
  MSE: 24.2911
  R²: 0.6688

Model dengan Fitur Discretisasi:
  MSE: 17.0118
  R²: 0.7680
