# ML600 Exercises: Optimization, Regularization & Model Selection

These exercises cover gradient descent from scratch, bias-variance tradeoff visualization,
hyperparameter tuning with GridSearchCV, recursive feature elimination, and an end-to-end
mini-project.

**Difficulty increases with each exercise.**

In [None]:
# ============================================================
# Setup: Run this cell first
# ============================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (
    train_test_split, cross_val_score, GridSearchCV
)
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.metrics import (
    mean_squared_error, r2_score, accuracy_score, classification_report
)
from sklearn.datasets import (
    load_diabetes, load_breast_cancer, make_classification
)
import joblib

# Single source of truth for reproducibility: the same constant seeds NumPy's
# global RNG here and is available to the exercises below.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("Setup complete.")

---
## Exercise 1: Gradient Descent from Scratch

**Goal:** Implement gradient descent for simple 1D linear regression
(y = w*x + b) from scratch and plot the cost function over iterations.

**Tasks:**
1. Generate synthetic 1D data: y = 3*x + 5 + noise.
2. Implement the gradient descent update rules for w and b.
3. Run gradient descent for 200 iterations with learning_rate = 0.01.
4. Track the MSE cost at each iteration.
5. Plot cost vs. iterations and the final regression line.

In [None]:
# Exercise 1 - Starter Code

import numpy as np
import matplotlib.pyplot as plt

# Fix the global NumPy seed so every run draws the same samples.
np.random.seed(42)

# Hyperparameters for the descent loop implemented in the TODOs below.
learning_rate, n_iterations = 0.01, 200

# Both parameters of the line y = w*x + b start from zero.
w, b = 0.0, 0.0   # slope, intercept

# Synthetic data: inputs uniform on [0, 2), targets on the line y = 3x + 5
# perturbed by Gaussian noise scaled by 0.5.
n_samples = 100
X = np.random.rand(n_samples) * 2
noise = np.random.randn(n_samples) * 0.5
y_true = 3 * X + 5 + noise

# TODO 1: Implement gradient descent
# costs = []
# for i in range(n_iterations):
#     # Forward pass: predictions
#     y_pred = w * X + b
#
#     # Compute cost (MSE)
#     cost = np.mean((y_pred - y_true) ** 2)
#     costs.append(cost)
#
#     # Compute gradients
#     dw = (2 / n_samples) * np.sum((y_pred - y_true) * X)
#     db = (2 / n_samples) * np.sum(y_pred - y_true)
#
#     # Update parameters
#     w = w - learning_rate * dw
#     b = b - learning_rate * db
#
#     if (i + 1) % 50 == 0:
#         print(f"Iteration {i+1:3d}: cost={cost:.4f}, w={w:.4f}, b={b:.4f}")

# TODO 2: Print final parameters
# print(f"\nFinal: w={w:.4f} (true=3.0), b={b:.4f} (true=5.0)")

# TODO 3: Plot cost vs iterations
# fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# axes[0].plot(costs)
# axes[0].set_xlabel('Iteration')
# axes[0].set_ylabel('MSE Cost')
# axes[0].set_title('Cost vs Iterations')

# TODO 4: Plot data with final regression line
# axes[1].scatter(X, y_true, alpha=0.5, label='Data')
# X_line = np.linspace(0, 2, 100)
# axes[1].plot(X_line, w * X_line + b, 'r-', label=f'y = {w:.2f}x + {b:.2f}', lw=2)
# axes[1].set_xlabel('X')
# axes[1].set_ylabel('y')
# axes[1].set_title('Linear Regression (Gradient Descent)')
# axes[1].legend()
# plt.tight_layout()
# plt.show()

### Solution

<details><summary>Click to reveal solution</summary>

```python
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global RNG so the data and the printed trace are reproducible.
np.random.seed(42)

# Generate data: inputs uniform on [0, 2), targets around the line y = 3x + 5.
n_samples = 100
X = np.random.rand(n_samples) * 2
y_true = 3 * X + 5 + np.random.randn(n_samples) * 0.5

# Initialize slope and intercept at zero.
w, b = 0.0, 0.0
learning_rate = 0.01
n_iterations = 200

# 1. Gradient descent on the MSE cost J(w, b) = mean((w*x + b - y)^2)
# NOTE(review): 200 steps at this learning rate may stop short of the true
# (3, 5) -- check the final printed values.
costs = []
for i in range(n_iterations):
    # Residuals of the current line; the cost is their mean square.
    residual = (w * X + b) - y_true
    cost = np.mean(residual ** 2)
    costs.append(cost)

    # Gradient of J: dJ/dw = (2/n) * sum(residual * x), dJ/db = (2/n) * sum(residual)
    dw = (2 / n_samples) * np.sum(residual * X)
    db = (2 / n_samples) * np.sum(residual)

    # Step against the gradient.
    w -= learning_rate * dw
    b -= learning_rate * db

    # Progress report every 50 iterations (cost is the value before this update).
    if (i + 1) % 50 == 0:
        print(f"Iteration {i+1:3d}: cost={cost:.4f}, w={w:.4f}, b={b:.4f}")

# 2. Final parameters
print(f"\nFinal: w={w:.4f} (true=3.0), b={b:.4f} (true=5.0)")

# 3 & 4. Plots: learning curve on the left, fitted line on the right
fig, (ax_cost, ax_fit) = plt.subplots(1, 2, figsize=(14, 5))

ax_cost.plot(costs)
ax_cost.set(xlabel='Iteration', ylabel='MSE Cost', title='Cost vs Iterations')

X_line = np.linspace(0, 2, 100)
ax_fit.scatter(X, y_true, alpha=0.5, label='Data')
ax_fit.plot(X_line, w * X_line + b, 'r-', label=f'y = {w:.2f}x + {b:.2f}', lw=2)
ax_fit.set(xlabel='X', ylabel='y', title='Linear Regression (Gradient Descent)')
ax_fit.legend()

plt.tight_layout()
plt.show()
```

</details>

---
## Exercise 2: Bias-Variance Tradeoff Visualization

**Goal:** Fit polynomials of increasing degree to a noisy dataset and visualize
the bias-variance tradeoff by plotting train and test error.

**Tasks:**
1. Generate synthetic data: `y = sin(2*pi*x) + noise`.
2. Split into train/test.
3. Fit polynomial regression of degrees 1, 3, 5, 9, 15.
4. Compute train and test MSE for each degree.
5. Plot train/test error vs. polynomial degree.

In [None]:
# Exercise 2 - Starter Code

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

np.random.seed(42)

# Polynomial degrees compared in the TODOs below.
degrees = [1, 3, 5, 9, 15]

# One period of a sine wave sampled at sorted uniform points in [0, 1),
# with Gaussian noise (scaled by 0.3) added to the targets.
n_samples = 100
x_sorted = np.sort(np.random.uniform(0, 1, n_samples))
X = x_sorted.reshape(-1, 1)   # single-column 2-D feature matrix
noise = np.random.randn(n_samples) * 0.3
y = np.sin(2 * np.pi * x_sorted) + noise

# Hold out 30% of the samples for testing (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# TODO 1: For each degree, fit polynomial regression and compute train/test MSE
# train_errors = []
# test_errors = []
# for degree in degrees:
#     poly = PolynomialFeatures(degree=degree, include_bias=False)
#     X_train_poly = poly.fit_transform(X_train)
#     X_test_poly = poly.transform(X_test)
#
#     model = LinearRegression()
#     model.fit(X_train_poly, y_train)
#
#     train_mse = mean_squared_error(y_train, model.predict(X_train_poly))
#     test_mse = mean_squared_error(y_test, model.predict(X_test_poly))
#     train_errors.append(train_mse)
#     test_errors.append(test_mse)
#     print(f"Degree {degree:2d}: Train MSE = {train_mse:.4f}, Test MSE = {test_mse:.4f}")

# TODO 2: Plot train vs test error
# plt.figure(figsize=(8, 5))
# plt.plot(degrees, train_errors, 'o-', label='Train MSE')
# plt.plot(degrees, test_errors, 's-', label='Test MSE')
# plt.xlabel('Polynomial Degree')
# plt.ylabel('Mean Squared Error')
# plt.title('Bias-Variance Tradeoff')
# plt.legend()
# plt.xticks(degrees)
# plt.tight_layout()
# plt.show()

# TODO 3: Plot the fitted curves for each degree
# fig, axes = plt.subplots(1, len(degrees), figsize=(20, 4))
# X_plot = np.linspace(0, 1, 200).reshape(-1, 1)
# for idx, degree in enumerate(degrees):
#     poly = PolynomialFeatures(degree=degree, include_bias=False)
#     X_train_poly = poly.fit_transform(X_train)
#     X_plot_poly = poly.transform(X_plot)
#     model = LinearRegression()
#     model.fit(X_train_poly, y_train)
#     y_plot = model.predict(X_plot_poly)
#
#     axes[idx].scatter(X_train, y_train, s=10, alpha=0.5)
#     axes[idx].plot(X_plot, y_plot, 'r-', lw=2)
#     axes[idx].plot(X_plot, np.sin(2 * np.pi * X_plot), 'g--', lw=1, alpha=0.5)
#     axes[idx].set_title(f'Degree {degree}')
#     axes[idx].set_ylim(-2, 2)
# plt.tight_layout()
# plt.show()

### Solution

<details><summary>Click to reveal solution</summary>

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

np.random.seed(42)

# Noisy samples of one sine period on [0, 1), sorted by x.
n_samples = 100
X = np.sort(np.random.uniform(0, 1, n_samples)).reshape(-1, 1)
y = np.sin(2 * np.pi * X).ravel() + np.random.randn(n_samples) * 0.3

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

degrees = [1, 3, 5, 9, 15]


def fit_polynomial(degree):
    """Fit polynomial regression of the given degree on the training split.

    Returns the fitted feature expander and the fitted linear model as a pair.
    """
    expander = PolynomialFeatures(degree=degree, include_bias=False)
    regressor = LinearRegression()
    regressor.fit(expander.fit_transform(X_train), y_train)
    return expander, regressor


# 1. Fit and evaluate
train_errors, test_errors = [], []
for degree in degrees:
    expander, regressor = fit_polynomial(degree)

    train_mse = mean_squared_error(
        y_train, regressor.predict(expander.transform(X_train))
    )
    test_mse = mean_squared_error(
        y_test, regressor.predict(expander.transform(X_test))
    )
    train_errors.append(train_mse)
    test_errors.append(test_mse)
    print(f"Degree {degree:2d}: Train MSE = {train_mse:.4f}, Test MSE = {test_mse:.4f}")

# 2. Error plot: one curve per split, over the same degree axis
plt.figure(figsize=(8, 5))
for errors, marker, label in (
    (train_errors, 'o-', 'Train MSE'),
    (test_errors, 's-', 'Test MSE'),
):
    plt.plot(degrees, errors, marker, label=label)
plt.xlabel('Polynomial Degree')
plt.ylabel('Mean Squared Error')
plt.title('Bias-Variance Tradeoff')
plt.legend()
plt.xticks(degrees)
plt.tight_layout()
plt.show()

# 3. Fitted curves: one panel per degree, with the noise-free sine for reference
fig, axes = plt.subplots(1, len(degrees), figsize=(20, 4))
X_plot = np.linspace(0, 1, 200).reshape(-1, 1)
for ax, degree in zip(axes, degrees):
    expander, regressor = fit_polynomial(degree)
    y_plot = regressor.predict(expander.transform(X_plot))

    ax.scatter(X_train, y_train, s=10, alpha=0.5)
    ax.plot(X_plot, y_plot, 'r-', lw=2)
    ax.plot(X_plot, np.sin(2 * np.pi * X_plot), 'g--', lw=1, alpha=0.5)
    ax.set_title(f'Degree {degree}')
    ax.set_ylim(-2, 2)   # clip so wild high-degree fits do not squash the panel
plt.tight_layout()
plt.show()

# Observations:
# - Degree 1: High bias (underfitting) -- cannot capture the sine wave.
# - Degree 3-5: Good fit -- captures the pattern without overfitting.
# - Degree 9-15: High variance (overfitting) -- fits noise in training data.
```

</details>

---
## Exercise 3: GridSearchCV for Random Forest Tuning

**Goal:** Use `GridSearchCV` to find the best hyperparameters for a Random Forest
classifier.

**Tasks:**
1. Load the breast cancer dataset.
2. Define a parameter grid for `n_estimators` and `max_depth`.
3. Run `GridSearchCV` with 5-fold cross-validation.
4. Report the best parameters and best score.
5. Evaluate the best model on the test set.

In [None]:
# Exercise 3 - Starter Code

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Load the breast cancer dataset as a feature matrix and a label vector.
data = load_breast_cancer()
X = data.data
y = data.target

# Reserve 20% of the samples as a held-out test set (fixed seed for repeatability).
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# TODO 1: Define parameter grid
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [3, 5, 10, None]
# }

# TODO 2: Create GridSearchCV
# rf = RandomForestClassifier(random_state=42)
# grid_search = GridSearchCV(
#     rf, param_grid, cv=5, scoring='accuracy',
#     verbose=1, n_jobs=-1
# )

# TODO 3: Fit GridSearchCV
# grid_search.fit(X_train, y_train)

# TODO 4: Report best parameters and score
# print(f"Best parameters: {grid_search.best_params_}")
# print(f"Best CV score: {grid_search.best_score_:.4f}")

# TODO 5: Evaluate on test set
# best_model = grid_search.best_estimator_
# test_acc = accuracy_score(y_test, best_model.predict(X_test))
# print(f"Test accuracy: {test_acc:.4f}")

# TODO 6 (Bonus): Display results as a DataFrame
# results_df = pd.DataFrame(grid_search.cv_results_)
# print(results_df[['param_n_estimators', 'param_max_depth', 'mean_test_score', 'std_test_score']]
#       .sort_values('mean_test_score', ascending=False)
#       .head(10)
#       .to_string(index=False))

### Solution

<details><summary>Click to reveal solution</summary>

```python
import pandas as pd  # needed for the results table in step 6
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Load the data and hold out 20% as a test set that the search never sees.
data = load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 1. Parameter grid (3 x 4 = 12 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10, None]
}

# 2. GridSearchCV: every candidate is scored by 5-fold CV accuracy on the
#    training split; n_jobs=-1 runs the fits in parallel.
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    rf, param_grid, cv=5, scoring='accuracy',
    verbose=1, n_jobs=-1
)

# 3. Fit
grid_search.fit(X_train, y_train)

# 4. Best results
print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

# 5. Test evaluation with the best estimator found by the search
best_model = grid_search.best_estimator_
test_acc = accuracy_score(y_test, best_model.predict(X_test))
print(f"Test accuracy: {test_acc:.4f}")

# 6. Results table: top 10 candidates by mean CV score
results_df = pd.DataFrame(grid_search.cv_results_)
print("\nAll results (sorted by score):")
print(results_df[['param_n_estimators', 'param_max_depth', 'mean_test_score', 'std_test_score']]
      .sort_values('mean_test_score', ascending=False)
      .head(10)
      .to_string(index=False))
```

</details>

---
## Exercise 4: Recursive Feature Elimination (RFE)

**Goal:** Use RFE with `LogisticRegression` to select the top 5 features
from a 15-feature dataset.

**Tasks:**
1. Generate a classification dataset with 15 features (8 informative).
2. Apply `RFE` with `LogisticRegression` to select top 5 features.
3. Print which features were selected and their rankings.
4. Compare model accuracy with all features vs. selected features.
5. Visualize feature rankings.

In [None]:
# Exercise 4 - Starter Code

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, cross_val_score

# Synthetic binary classification problem: 500 samples and 15 columns,
# 8 of them informative and 3 redundant.
X, y = make_classification(
    n_samples=500,
    n_features=15,
    n_informative=8,
    n_redundant=3,
    n_classes=2,
    random_state=42,
)
# One readable label per column, used when reporting the rankings.
feature_names = [f'feature_{i}' for i in range(X.shape[1])]

# Keep 20% of the samples aside as a test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# TODO 1: Create RFE with LogisticRegression, select top 5
# estimator = LogisticRegression(max_iter=1000, random_state=42)
# rfe = RFE(estimator, n_features_to_select=5)
# rfe.fit(X_train, y_train)

# TODO 2: Print selected features and rankings
# for name, selected, rank in zip(feature_names, rfe.support_, rfe.ranking_):
#     status = 'SELECTED' if selected else f'rank {rank}'
#     print(f"{name}: {status}")

# TODO 3: Compare accuracy: all features vs selected features
# all_scores = cross_val_score(
#     LogisticRegression(max_iter=1000, random_state=42),
#     X_train, y_train, cv=5, scoring='accuracy'
# )
# rfe_scores = cross_val_score(
#     LogisticRegression(max_iter=1000, random_state=42),
#     X_train[:, rfe.support_], y_train, cv=5, scoring='accuracy'
# )
# print(f"\nAll features ({X.shape[1]}): {all_scores.mean():.4f} +/- {all_scores.std():.4f}")
# print(f"RFE features (5):   {rfe_scores.mean():.4f} +/- {rfe_scores.std():.4f}")

# TODO 4: Visualize rankings
# plt.figure(figsize=(10, 5))
# plt.barh(feature_names, rfe.ranking_)
# plt.xlabel('RFE Ranking (1 = selected)')
# plt.title('Feature Rankings by RFE')
# plt.tight_layout()
# plt.show()

### Solution

<details><summary>Click to reveal solution</summary>

```python
import matplotlib.pyplot as plt  # needed for the ranking plot in step 4
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline

# Synthetic binary problem: 15 features, 8 informative and 3 redundant.
X, y = make_classification(
    n_samples=500, n_features=15, n_informative=8,
    n_redundant=3, n_classes=2, random_state=42
)
feature_names = [f'feature_{i}' for i in range(15)]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 1. RFE: repeatedly fit the estimator and drop the weakest feature until 5 remain
estimator = LogisticRegression(max_iter=1000, random_state=42)
rfe = RFE(estimator, n_features_to_select=5)
rfe.fit(X_train, y_train)

# 2. Print features (rank 1 means the feature was kept)
print("Feature Selection Results:")
for name, selected, rank in zip(feature_names, rfe.support_, rfe.ranking_):
    status = 'SELECTED' if selected else f'rank {rank}'
    print(f"  {name}: {status}")

selected_features = [name for name, sel in zip(feature_names, rfe.support_) if sel]
print(f"\nSelected features: {selected_features}")

# 3. Compare accuracy
all_scores = cross_val_score(
    LogisticRegression(max_iter=1000, random_state=42),
    X_train, y_train, cv=5, scoring='accuracy'
)
# Run the feature selection inside every CV fold. Scoring on
# X_train[:, rfe.support_] would reuse a selection that was fitted on each
# validation fold's labels, which leaks information and inflates the estimate.
rfe_pipeline = Pipeline([
    ('rfe', RFE(LogisticRegression(max_iter=1000, random_state=42),
                n_features_to_select=5)),
    ('clf', LogisticRegression(max_iter=1000, random_state=42)),
])
rfe_scores = cross_val_score(
    rfe_pipeline, X_train, y_train, cv=5, scoring='accuracy'
)
print(f"\nAll features ({X.shape[1]}): {all_scores.mean():.4f} +/- {all_scores.std():.4f}")
print(f"RFE features (5):   {rfe_scores.mean():.4f} +/- {rfe_scores.std():.4f}")

# 4. Visualize: selected features in green, eliminated ones in gray
plt.figure(figsize=(10, 5))
colors = ['green' if s else 'gray' for s in rfe.support_]
plt.barh(feature_names, rfe.ranking_, color=colors)
plt.xlabel('RFE Ranking (1 = selected)')
plt.title('Feature Rankings by RFE')
plt.tight_layout()
plt.show()
```

</details>

---
## Exercise 5: End-to-End Mini-Project

**Goal:** Complete an end-to-end ML workflow: load data, explore, split, create
a baseline, build a pipeline, evaluate with cross-validation, and save the model.

**Tasks:**
1. Load the diabetes dataset and explore it briefly.
2. Split into train/test (80/20).
3. Create a baseline model (predict the mean of y_train).
4. Build a pipeline: StandardScaler + Ridge regression.
5. Evaluate with 5-fold CV, report R2, compare to baseline, and save the model.

In [None]:
# Exercise 5 - Starter Code

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# ---- STEP 1: Load and Explore ----
# as_frame=True makes the loader return pandas objects.
diabetes = load_diabetes(as_frame=True)
X, y = diabetes.data, diabetes.target
df = diabetes.frame

# Quick look: overall size, a few rows, then per-column summary statistics.
print("Dataset shape:", df.shape)
for heading, table in (
    ("\nFirst 5 rows:", df.head()),
    ("\nDescription:", df.describe()),
):
    print(heading)
    print(table)

# TODO 2: Split 80/20
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42
# )
# print(f"\nTrain size: {len(X_train)}, Test size: {len(X_test)}")

# TODO 3: Baseline model (predict the mean)
# y_baseline = np.full_like(y_test, y_train.mean())
# baseline_mse = mean_squared_error(y_test, y_baseline)
# baseline_r2 = r2_score(y_test, y_baseline)
# print(f"\nBaseline (predict mean):")
# print(f"  MSE: {baseline_mse:.2f}")
# print(f"  R2:  {baseline_r2:.4f}")

# TODO 4: Build pipeline (StandardScaler + Ridge)
# pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('ridge', Ridge(alpha=1.0))
# ])

# TODO 5a: Cross-validation
# cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
# print(f"\nPipeline (5-fold CV):")
# print(f"  R2: {cv_scores.mean():.4f} +/- {cv_scores.std():.4f}")

# TODO 5b: Fit on full training set and evaluate on test
# pipeline.fit(X_train, y_train)
# y_pred = pipeline.predict(X_test)
# test_mse = mean_squared_error(y_test, y_pred)
# test_r2 = r2_score(y_test, y_pred)
# print(f"\nTest set evaluation:")
# print(f"  MSE: {test_mse:.2f}")
# print(f"  R2:  {test_r2:.4f}")
# print(f"  Improvement over baseline: {test_r2 - baseline_r2:.4f} R2")

# TODO 5c: Save the model
# joblib.dump(pipeline, 'diabetes_ridge_pipeline.pkl')
# print("\nModel saved to diabetes_ridge_pipeline.pkl")

# TODO 5d: Verify saved model
# loaded_model = joblib.load('diabetes_ridge_pipeline.pkl')
# verify_r2 = r2_score(y_test, loaded_model.predict(X_test))
# print(f"Loaded model R2: {verify_r2:.4f} (should match {test_r2:.4f})")

### Solution

<details><summary>Click to reveal solution</summary>

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# 1. Load and explore (as_frame=True returns pandas objects)
diabetes = load_diabetes(as_frame=True)
df = diabetes.frame
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())

X = diabetes.data
y = diabetes.target

# 2. Split 80/20 with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"\nTrain size: {len(X_train)}, Test size: {len(X_test)}")

# 3. Baseline: predict the training mean for every test sample.
# np.full builds a float array of the right length; np.full_like(y_test, ...)
# would inherit y_test's dtype and truncate the mean for an integer target.
y_baseline = np.full(len(y_test), y_train.mean())
baseline_mse = mean_squared_error(y_test, y_baseline)
baseline_r2 = r2_score(y_test, y_baseline)
print(f"\nBaseline (predict mean):")
print(f"  MSE: {baseline_mse:.2f}")
print(f"  R2:  {baseline_r2:.4f}")

# 4. Pipeline: scaling lives inside the pipeline so that, during
#    cross-validation, the scaler is fitted on each training fold only.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge(alpha=1.0))
])

# 5a. Cross-validation on the training split only
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
print(f"\nPipeline (5-fold CV):")
print(f"  R2: {cv_scores.mean():.4f} +/- {cv_scores.std():.4f}")

# 5b. Test evaluation: fit on the full training split, score on the held-out set
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)
print(f"\nTest set evaluation:")
print(f"  MSE: {test_mse:.2f}")
print(f"  R2:  {test_r2:.4f}")
print(f"  Improvement over baseline: {test_r2 - baseline_r2:.4f} R2")

# Plot actual vs predicted; the dashed diagonal marks perfect predictions
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title(f'Diabetes Prediction (R2={test_r2:.4f})')
plt.tight_layout()
plt.show()

# 5c. Save the fitted pipeline (scaler + model together) to the working directory
joblib.dump(pipeline, 'diabetes_ridge_pipeline.pkl')
print("\nModel saved to diabetes_ridge_pipeline.pkl")

# 5d. Verify that the reloaded pipeline reproduces the test score.
# joblib files are pickle-based: only load files from a source you trust.
loaded_model = joblib.load('diabetes_ridge_pipeline.pkl')
verify_r2 = r2_score(y_test, loaded_model.predict(X_test))
print(f"Loaded model R2: {verify_r2:.4f} (should match {test_r2:.4f})")
```

</details>