In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Read the raw CSV; header=None makes pandas treat the first line as data.
file_path = 'RegresiUTSTelkom.csv'  # replace with the path to your copy of the file
data = pd.read_csv(file_path, header=None)

# Assumption: column 0 holds the regression target, every other column is a feature.
y = data.iloc[:, 0].values
X = data.iloc[:, 1:].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline estimators keyed by display name. Entries are added in the order
# in which the evaluation loop visits them.
models = {}

# Degree-2 polynomial regression: standardize the inputs, expand them into
# polynomial basis functions (no constant column), then fit a linear model.
models["Polynomial Regression"] = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("poly_features", PolynomialFeatures(degree=2, include_bias=False)),
    ("regressor", LinearRegression()),
])

# Regression tree; max_depth caps the depth to keep model complexity in check.
models["Decision Tree"] = Pipeline(steps=[
    ("regressor", DecisionTreeRegressor(max_depth=5, random_state=42)),
])

# k-nearest neighbours with k=5, fitted on standardized features.
models["k-NN"] = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("regressor", KNeighborsRegressor(n_neighbors=5)),
])

# Gradient-boosted trees with a fixed baseline configuration
# (100 estimators, depth 5); verbosity=0 silences library logging.
models["XGBoost"] = Pipeline(steps=[
    ("regressor", XGBRegressor(n_estimators=100, max_depth=5, random_state=42, verbosity=0)),
])

# Fit every pipeline on the training split and report its MSE on the test split.
for model_name, pipeline in models.items():
    print(f"\nEvaluating {model_name}...")
    # Pipeline.fit returns the fitted pipeline, so training and prediction chain.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error (MSE): {mse:.4f}")



Evaluating Polynomial Regression...


In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression
import numpy as np
import pandas as pd

# Read the raw CSV; header=None makes pandas treat the first line as data.
file_path = 'RegresiUTSTelkom.csv'  # replace with the path to your copy of the file
data = pd.read_csv(file_path, header=None)

# Assumption: column 0 holds the regression target, every other column is a feature.
y = data.iloc[:, 0].values
X = data.iloc[:, 1:].values

# Step 1: keep only the "train" side of a 90% hold-out (roughly 10% of the rows)
# so that hyperparameter tuning stays affordable.
X_sample, _, y_sample, _ = train_test_split(
    X, y, test_size=0.9, random_state=42
)

# Step 2: split that subsample 80/20 into tuning and evaluation sets.
X_train, X_test, y_train, y_test = train_test_split(
    X_sample, y_sample, test_size=0.2, random_state=42
)

# Step 3 (optional, currently disabled): keep only the k highest-scoring features.
# Fit ONE selector on the training split and reuse it for the test split;
# a second, unfitted SelectKBest cannot transform.
# selector = SelectKBest(score_func=f_regression, k=50).fit(X_train, y_train)
# X_train = selector.transform(X_train)
# X_test = selector.transform(X_test)

# Estimators to tune, keyed by display name. The step names ("poly_features",
# "regressor", ...) are the prefixes used by the hyperparameter grids.
models = {
    # Standardize before the polynomial expansion, as the untuned polynomial
    # pipeline and the k-NN pipeline do: raw-scale features raised to
    # degree 2-3 produce badly conditioned design matrices for least squares.
    "Polynomial Regression": Pipeline([
        ("scaler", StandardScaler()),
        ("poly_features", PolynomialFeatures(degree=2, include_bias=False)),
        ("regressor", LinearRegression())
    ]),
    # Trees are insensitive to feature scale, so no scaler step is needed.
    "Decision Tree": Pipeline([
        ("regressor", DecisionTreeRegressor(random_state=42))
    ]),
    # Distance-based model: features must share a common scale.
    "k-NN": Pipeline([
        ("scaler", StandardScaler()),
        ("regressor", KNeighborsRegressor())
    ]),
    # Gradient-boosted trees; verbosity=0 silences library logging.
    "XGBoost": Pipeline([
        ("regressor", XGBRegressor(random_state=42, verbosity=0))
    ])
}

# Search space per model. Keys follow scikit-learn's "<step>__<parameter>"
# convention, where <step> is the name of a pipeline step.
param_grids = {}

# Polynomial degree is capped at 3; the intercept is always fitted.
param_grids["Polynomial Regression"] = {
    "poly_features__degree": [2, 3],
    "regressor__fit_intercept": [True],
}

# Tree depth and the minimum node size required to attempt a split.
param_grids["Decision Tree"] = {
    "regressor__max_depth": [3, 5, 10],
    "regressor__min_samples_split": [2, 5],
}

# Neighbourhood size and how neighbours are weighted when averaging.
param_grids["k-NN"] = {
    "regressor__n_neighbors": [3, 5],
    "regressor__weights": ["uniform", "distance"],
}

# Number of boosting rounds, tree depth and learning rate.
param_grids["XGBoost"] = {
    "regressor__n_estimators": [50, 100],
    "regressor__max_depth": [3, 5],
    "regressor__learning_rate": [0.01, 0.1],
}

# Search settings shared by every model: negated MSE as the score, 2-fold CV
# to save time, all available cores, and a progress line per search.
search_options = dict(
    scoring='neg_mean_squared_error',
    cv=2,
    n_jobs=-1,
    verbose=1,
)

# Per-model outcome: best parameter set and MSE on the hold-out split.
results = {}

for model_name, pipeline in models.items():
    print(f"\nTuning hyperparameters for {model_name}...")

    # Exhaustive search over this model's grid, fitted on the tuning split.
    grid_search = GridSearchCV(
        pipeline,
        param_grid=param_grids[model_name],
        **search_options,
    )
    grid_search.fit(X_train, y_train)

    # With the default refit behaviour, best_estimator_ is the winning
    # configuration retrained on all of X_train.
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # X_test is the hold-out part of the subsample, not of the whole dataset.
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    results[model_name] = {
        "Best Parameters": best_params,
        "Test MSE": mse,
    }

# Report the winning parameters and hold-out MSE recorded for each model.
print("\nHyperparameter Tuning Results:")
for model_name, result in results.items():
    # One print call; sep="\n" yields the same three lines as separate calls.
    print(
        f"\nModel: {model_name}",
        f"Best Parameters: {result['Best Parameters']}",
        f"Test Mean Squared Error: {result['Test MSE']:.4f}",
        sep="\n",
    )



Tuning hyperparameters for Polynomial Regression...
Fitting 2 folds for each of 2 candidates, totalling 4 fits


KeyboardInterrupt: 