In [1]:
# --- Import Libraries ---
import numpy as np
import pandas as pd

from sklearn.datasets import load_diabetes   # built-in regression dataset
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import r2_score

# --- Step 1: Load the built-in diabetes regression dataset ---
data = load_diabetes()
X, y = data.data, data.target   # feature matrix and regression target

# Hold out 20% of the samples as a test set; the fixed seed keeps the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Step 2: Create Pipeline for Polynomial Regression ---
# Pipeline: PolynomialFeatures -> StandardScaler -> Ridge Regression
pipeline = Pipeline([
    # include_bias=False: the default bias column is a constant 1, which
    # StandardScaler maps to an all-zero column (zero variance) and Ridge
    # already fits its own intercept, so the column would only be a dead
    # feature carried through every fit.
    ("poly", PolynomialFeatures(include_bias=False)),  # polynomial feature expansion
    ("scaler", StandardScaler()),                      # standardize the expanded features
    ("ridge", Ridge()),                                # linear regression with L2 regularization
])

# --- Step 3: Hyperparameter grid ---
# Keys follow the "<pipeline step>__<parameter>" convention so that the
# search can route each value to the matching step of the pipeline.
param_grid = dict(
    poly__degree=list(range(1, 5)),      # polynomial degrees 1 through 4
    ridge__alpha=[0.1, 1, 10, 100],      # regularization strengths to try
)

# --- Step 4: Cross-validated grid search ---
# Every (degree, alpha) combination from param_grid is scored with 5-fold
# cross-validation on the training split using R²; n_jobs=-1 runs the fits
# in parallel and verbose=1 prints a one-line progress summary.
# fit() returns the search object itself, so `grid` is the fitted search.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="r2",
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# --- Step 5 & 6: Report the selected model and score it on held-out data ---
# Predictions come from the search object, which delegates to the best
# estimator found during the search.
y_pred = grid.predict(X_test)
r2 = r2_score(y_test, y_pred)

# Print the cross-validation summary first, then the test-set score.
for label, value in (
    ("Best Parameters:", grid.best_params_),
    ("Best CV R² Score:", grid.best_score_),
    ("Test R² Score:", r2),
):
    print(label, value)


Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Parameters: {'poly__degree': 1, 'ridge__alpha': 10}
Best CV R² Score: 0.453937415795935
Test R² Score: 0.45721095677808476
