In [1]:
import gdown
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.feature_selection import (
    VarianceThreshold,
    SelectKBest,
    mutual_info_regression,
    f_regression,
)
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import (
    BaggingRegressor,
    AdaBoostRegressor,
    GradientBoostingRegressor,
)
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the regression dataset from a CSV file in the working directory.
# NOTE(review): a later cell selects a column labelled "2001" as the target,
# which suggests the file's first row may be a data row that is being read as
# the header -- confirm against the raw file.
data = pd.read_csv(filepath_or_buffer="RegresiUTSTelkom.csv")

In [3]:
# Remove exact duplicate rows.
data = data.drop_duplicates()
# Fill missing values with each column's mean. `numeric_only=True` keeps this
# working when the frame also holds non-numeric columns: pandas >= 2.0 raises
# a TypeError from `DataFrame.mean()` in that case instead of skipping them.
# Non-numeric columns are left as they are.
# NOTE(review): the means are computed on the full dataset and are also applied
# to the target column selected later ("2001"), before the train/test split.
data = data.fillna(data.mean(numeric_only=True))

In [4]:
# Feature engineering: separate the target column from the features, then
# prune features that carry little or redundant information.
# NOTE(review): every filter below is computed on the full dataset, i.e. before
# the train/test split performed later in the notebook.
X = data.drop(columns=["2001"])
y = data["2001"]

# Drop constant columns (zero-variance threshold).
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(X)
constant_columns = X.columns[~constant_filter.get_support()]
X = X.drop(columns=constant_columns)

# Drop quasi-constant columns using a 0.01 variance threshold.
quasi_constant_filter = VarianceThreshold(threshold=0.01)
quasi_constant_filter.fit(X)
quasi_constant_columns = X.columns[~quasi_constant_filter.get_support()]
X = X.drop(columns=quasi_constant_columns)

# Drop duplicated columns, keeping the first occurrence of each. Selecting with
# a boolean mask (rather than transposing the frame and transposing it back)
# leaves the dtypes of the surviving columns untouched, so the correlation step
# below still sees numeric columns.
is_duplicate_column = X.T.duplicated().to_numpy()
X = X.loc[:, ~is_duplicate_column]

# Drop one column out of every pair whose absolute correlation exceeds 0.9.
# Only the strict upper triangle is inspected so each pair is considered once
# and the diagonal (self-correlation) is ignored.
corr_matrix = X.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
to_drop = [column for column in upper.columns if (upper[column] > 0.9).any()]
X = X.drop(columns=to_drop)


In [5]:
# Hold out 20% of the rows for evaluation, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Numeric preprocessing: mean-impute missing values, then standardize.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Route every remaining feature column through the numeric preprocessing.
preprocessor = ColumnTransformer([("num", numeric_transformer, X.columns)])

In [None]:
def seeded_mutual_info(X, y):
    """Score features with mutual information using a fixed random seed.

    ``mutual_info_regression`` is randomized unless ``random_state`` is set.
    Pinning it keeps the selected features, and therefore the reported metrics,
    stable across runs, in line with the seeded models below.

    Parameters
    ----------
    X : array-like
        Feature matrix handed over by ``SelectKBest``.
    y : array-like
        Target values.

    Returns
    -------
    ndarray
        Estimated mutual information between each feature and the target.
    """
    return mutual_info_regression(X, y, random_state=42)


# Candidate regressors, keyed by the label shown in the results table.
models = {
    "Linear Regression": LinearRegression(),
    "Polynomial Regression": Pipeline(
        [
            ("poly", PolynomialFeatures(degree=2)),
            ("linear", LinearRegression()),
        ]
    ),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "KNN": KNeighborsRegressor(),
    "Bagging": BaggingRegressor(random_state=42),
    "AdaBoost": AdaBoostRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "SVR": SVR(),
}

# Cap k at the number of available features: depending on the scikit-learn
# version, SelectKBest either raises or warns when k exceeds it.
n_selected_features = min(10, X_train.shape[1])

results = {}
for name, model in models.items():
    # Preprocessing and feature selection are refit together with each model.
    pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            (
                "feature_selection",
                SelectKBest(score_func=seeded_mutual_info, k=n_selected_features),
            ),
            ("model", model),
        ]
    )

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Evaluate on the held-out split only.
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results[name] = {
        "MSE": round(mse, 4),
        "R2 Score": round(r2, 4),
    }

# Show the results: one row per model, one column per metric.
results_df = pd.DataFrame(results).T
print("\nHasil Evaluasi Model:")
print(results_df)