## **1. Transformasi Data**

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load dataset dari URL
file_path = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
df = pd.read_csv(file_path)

# Memisahkan fitur (X) dan target (y)
X = df.drop(columns=["medv"])  # 'medv' adalah harga rumah (target)
y = df["medv"]

# Membagi dataset menjadi training dan testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Dictionary untuk menyimpan hasil transformasi
results = {}

# Fungsi untuk menerapkan transformasi dan melatih model
def train_model(transformer, name, X_train, X_test):
    X_train_transformed = transformer.fit_transform(X_train)
    X_test_transformed = transformer.transform(X_test)

    model = LinearRegression()
    model.fit(X_train_transformed, y_train)
    y_pred = model.predict(X_test_transformed)

    mse = mean_squared_error(y_test, y_pred)
    results[name] = mse

# 1. Yeo-Johnson Transform
train_model(PowerTransformer(method='yeo-johnson'), "Yeo-Johnson", X_train, X_test)

# 2. Log Transform (Menghindari nol dengan log(x+1))
train_model(FunctionTransformer(np.log1p, validate=True), "Log Transform", X_train, X_test)

# 3. Box-Cox Transform (Memastikan semua nilai positif sebelum transformasi)
X_train_boxcox = X_train + abs(X_train.min()) + 1  # Menjadikan semua nilai positif
X_test_boxcox = X_test + abs(X_train.min()) + 1  # Pastikan transformasi sama seperti train

train_model(PowerTransformer(method='box-cox'), "Box-Cox", X_train_boxcox, X_test_boxcox)

# 4. Quantile Transform (Distribusi normal)
train_model(QuantileTransformer(output_distribution='normal', random_state=42), "Quantile Transform", X_train, X_test)

# Menampilkan hasil
print(results)


{'Yeo-Johnson': 19.69075058360577, '\nLog Transform': 18.466623716585616, '\nBox-Cox': 19.835517360226927, '\nQuantile Transform': 19.202839569582025}




## **2. Normalisasi dan Standardisasi Data**

In [6]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load dataset Boston Housing dari file CSV
file_path = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
df = pd.read_csv(file_path)

# Pisahkan fitur dan target
X = df.drop(columns=["medv"])  # "medv" adalah harga rumah (target)
y = df["medv"]

# Fungsi untuk menerapkan scaling dan mengevaluasi model
def evaluate_scaling(scaler):
    X_scaled = scaler.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    return mse

# Evaluasi dengan berbagai metode scaling
mse_results = {
    "MinMax Scaling": evaluate_scaling(MinMaxScaler()),
    "Standard Scaling": evaluate_scaling(StandardScaler()),
    "Robust Scaling": evaluate_scaling(RobustScaler()),
}

# Menampilkan hasil MSE untuk setiap metode scaling
print("Hasil MSE untuk setiap metode scaling:")
for method, mse in mse_results.items():
    print(f"{method}: {mse:.4f}")


Hasil MSE untuk setiap metode scaling:
MinMax Scaling: 24.2911
Standard Scaling: 24.2911
Robust Scaling: 24.2911


## **3. Discretization pada Fitur Numerik**

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer

# Load dataset
file_path = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
df = pd.read_csv(file_path)

# Pisahkan fitur numerik dan target
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
numerical_features.remove("medv")  # Target tidak diubah

# Split dataset untuk model dengan fitur kontinu
X_cont = df.drop(columns=["medv"])
y = df["medv"]
X_train_cont, X_test_cont, y_train, y_test = train_test_split(X_cont, y, test_size=0.2, random_state=42)

# Normalisasi fitur numerik kontinu
scaler = StandardScaler()
X_train_cont[numerical_features] = scaler.fit_transform(X_train_cont[numerical_features])
X_test_cont[numerical_features] = scaler.transform(X_test_cont[numerical_features])

# Training model dengan fitur kontinu
model_cont = LinearRegression()
model_cont.fit(X_train_cont, y_train)
y_pred_cont = model_cont.predict(X_test_cont)
mse_cont = mean_squared_error(y_test, y_pred_cont)

# Discretization menggunakan KBinsDiscretizer
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
X_disc = df.copy()
X_disc[numerical_features] = discretizer.fit_transform(X_disc[numerical_features])

# Split dataset untuk model dengan fitur diskret
X_train_disc, X_test_disc, y_train_disc, y_test_disc = train_test_split(
    X_disc.drop(columns=["medv"]), y, test_size=0.2, random_state=42
)

# Training model dengan fitur diskret
model_disc = LinearRegression()
model_disc.fit(X_train_disc, y_train_disc)
y_pred_disc = model_disc.predict(X_test_disc)
mse_disc = mean_squared_error(y_test_disc, y_pred_disc)

# Cetak hasil perbandingan
print("MSE dengan fitur kontinu:", mse_cont)
print("MSE dengan fitur diskret:", mse_disc)


MSE dengan fitur kontinu: 24.291119474973513
MSE dengan fitur diskret: 27.965770383923992
