# ML200 Exercises: Linear Regression

These exercises cover linear regression fundamentals: simple and multiple regression,
assumption checking, regularization (Ridge, Lasso, ElasticNet), and pipeline construction.

**Difficulty increases with each exercise.**

In [None]:
# ============================================================
# Setup: Run this cell first
# ============================================================
# The exercise and solution cells below use `np` and `plt` without importing
# them again, so this cell has to be executed before any of them.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, ElasticNet, LassoCV
)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_california_housing, make_regression
from sklearn.metrics import r2_score, mean_squared_error

# Seed NumPy's global random number generator.
np.random.seed(42)
# Named seed constant; note that the exercise cells pass the literal 42 to
# `random_state` rather than referencing this name.
RANDOM_STATE = 42

print("Setup complete.")

---
## Exercise 1: Simple Linear Regression on California Housing

**Goal:** Fit a linear regression using 2 features from the California Housing
dataset, report R-squared, and create an actual vs. predicted scatter plot.

**Tasks:**
1. Load California Housing and select `MedInc` and `AveRooms` as features.
2. Perform an 80/20 split (`random_state=42`).
3. Fit a `LinearRegression` model.
4. Compute R-squared on the test set.
5. Create a scatter plot of actual vs. predicted values with a diagonal reference line.

In [None]:
# Exercise 1 - Starter Code
# Loads California Housing into a DataFrame and prints its column names.
# Each TODO below is a commented-out hint: complete it, then uncomment it.
# `plt` is not imported here; it comes from the setup cell.

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

housing = fetch_california_housing(as_frame=True)
df = housing.frame
print("Available columns:", df.columns.tolist())

# TODO 1: Select MedInc and AveRooms as features
# X = df[['MedInc', 'AveRooms']]
# y = df['MedHouseVal']

# TODO 2: 80/20 split
# X_train, X_test, y_train, y_test = ...

# TODO 3: Fit LinearRegression
# model = LinearRegression()
# model.fit(...)

# TODO 4: Predict and compute R2
# y_pred = model.predict(X_test)
# r2 = r2_score(y_test, y_pred)
# print(f"R-squared: {r2:.4f}")

# TODO 5: Plot actual vs predicted
# (the red dashed line is drawn along actual == predicted, so points lying
#  on it are predicted exactly)
# plt.figure(figsize=(8, 6))
# plt.scatter(y_test, y_pred, alpha=0.3)
# plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
# plt.xlabel('Actual')
# plt.ylabel('Predicted')
# plt.title('Actual vs Predicted House Values')
# plt.tight_layout()
# plt.show()

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Load the dataset as one DataFrame holding both the features and the target.
housing = fetch_california_housing(as_frame=True)
df = housing.frame

# 1. Two predictors plus the target column
feature_cols = ['MedInc', 'AveRooms']
X = df[feature_cols]
y = df['MedHouseVal']

# 2. Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Fit (scikit-learn's `fit` returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# 4. Score the held-out predictions and show the fitted parameters
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"R-squared: {r2:.4f}")
print(f"Coefficients: {dict(zip(X.columns, model.coef_))}")
print(f"Intercept: {model.intercept_:.4f}")

# 5. Actual vs. predicted, with the actual == predicted diagonal for reference
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.3)
lims = [y_test.min(), y_test.max()]
ax.plot(lims, lims, 'r--', lw=2)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Actual vs Predicted House Values')
fig.tight_layout()
plt.show()
```

</details>

---
## Exercise 2: Check Linear Regression Assumptions

**Goal:** After fitting a linear regression, create diagnostic plots to check
the assumptions of linearity and normality of residuals.

**Tasks:**
1. Fit a linear regression on the California Housing dataset (all features).
2. Compute residuals (`y_test - y_pred`).
3. Create a residual plot (predicted values vs. residuals).
4. Create a Q-Q plot of the residuals using `scipy.stats.probplot`.
5. Interpret the plots in a comment.

In [None]:
# Exercise 2 - Starter Code
# The model is already fitted on all features and `y_pred` holds its
# test-set predictions; the TODOs build the diagnostics on top of that.
# `plt` is not imported here; it comes from the setup cell.

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import scipy.stats as stats

housing = fetch_california_housing(as_frame=True)
X = housing.data
y = housing.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# TODO 1: Compute residuals
# (actual minus predicted, i.e. y_test - y_pred)
# residuals = ...

# TODO 2: Create residual plot (predicted vs residuals)
# (the dashed horizontal line marks residual == 0)
# fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# axes[0].scatter(y_pred, residuals, alpha=0.3)
# axes[0].axhline(y=0, color='r', linestyle='--')
# axes[0].set_xlabel('Predicted Values')
# axes[0].set_ylabel('Residuals')
# axes[0].set_title('Residual Plot')

# TODO 3: Create Q-Q plot
# (drawn on the second subplot created in TODO 2)
# stats.probplot(residuals, dist='norm', plot=axes[1])
# axes[1].set_title('Q-Q Plot of Residuals')

# plt.tight_layout()
# plt.show()

# TODO 4: Interpretation (write in a comment)
# Your interpretation: ...

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import scipy.stats as stats

# Use every feature column of the dataset.
housing = fetch_california_housing(as_frame=True)
X, y = housing.data, housing.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# 1. Residuals: actual minus predicted
residuals = y_test - y_pred

# 2 & 3. Two diagnostics side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
resid_ax, qq_ax = axes

# Left panel: residuals against predictions, with a zero reference line
resid_ax.scatter(y_pred, residuals, alpha=0.3)
resid_ax.axhline(y=0, color='r', linestyle='--')
resid_ax.set_xlabel('Predicted Values')
resid_ax.set_ylabel('Residuals')
resid_ax.set_title('Residual Plot')

# Right panel: normal Q-Q plot; the title is set after probplot draws
stats.probplot(residuals, dist='norm', plot=qq_ax)
qq_ax.set_title('Q-Q Plot of Residuals')

fig.tight_layout()
plt.show()

# 4. Interpretation:
# - The residuals are not randomly scattered around zero; the visible
#   pattern means the linear fit misses part of the relationship.
# - In the Q-Q plot the tails bend away from the reference line, so the
#   residuals are not perfectly normal.
# - Feature transformations or a non-linear model may therefore help.
print("Residual statistics:")
print(f"  Mean: {residuals.mean():.4f} (should be ~0)")
print(f"  Std:  {residuals.std():.4f}")
```

</details>

---
## Exercise 3: Ridge vs. Lasso vs. ElasticNet

**Goal:** Compare Ridge, Lasso, and ElasticNet regression using 5-fold
cross-validation on a synthetic dataset with 20 features (only 5 informative).

**Tasks:**
1. Generate a synthetic regression dataset with `make_regression`.
2. Train Ridge, Lasso, and ElasticNet models (all with `alpha=1.0`).
3. Evaluate each with 5-fold CV using negative MSE.
4. Print the results in a comparison table.
5. Fit each model on the full dataset, count how many coefficients are (numerically) zero, and discuss which model performs best and why.

In [None]:
# Exercise 3 - Starter Code
# Builds the synthetic dataset and prints its shape; the TODOs compare three
# regularized models on it. Complete each hint, then uncomment it.

from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score

# Generate synthetic data: 20 features, only 5 are informative
X, y = make_regression(
    n_samples=500, n_features=20, n_informative=5,
    noise=10, random_state=42
)

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

# TODO 1: Create models with alpha=1.0
# ridge = Ridge(alpha=1.0, random_state=42)
# lasso = Lasso(alpha=1.0, random_state=42)
# elasticnet = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42)

# TODO 2: Evaluate each with 5-fold CV
# (with this scoring name the returned values are negated MSEs)
# scoring = 'neg_mean_squared_error'
# ridge_scores = cross_val_score(ridge, X, y, cv=5, scoring=scoring)
# lasso_scores = ...
# elasticnet_scores = ...

# TODO 3: Print comparison (convert neg MSE to positive)
# (only the mean needs its sign flipped; the spread across folds does not
#  depend on the sign)
# for name, scores in [('Ridge', ridge_scores), ('Lasso', lasso_scores), ('ElasticNet', elasticnet_scores)]:
#     mse = -scores.mean()
#     std = scores.std()
#     print(f"{name:12s} MSE: {mse:.2f} +/- {std:.2f}")

# TODO 4: Fit all models and compare number of zero coefficients
# (Hint: Lasso should zero out uninformative features)

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score

# Synthetic problem: 20 features, of which only 5 carry signal.
X, y = make_regression(
    n_samples=500, n_features=20, n_informative=5,
    noise=10, random_state=42
)

# 1. The three regularized models, all at the same penalty strength
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=1.0, random_state=42)
elasticnet = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42)
models = {'Ridge': ridge, 'Lasso': lasso, 'ElasticNet': elasticnet}

# 2. 5-fold CV for each model (scores come back as negated MSE)
scoring = 'neg_mean_squared_error'
cv_scores = {
    label: cross_val_score(estimator, X, y, cv=5, scoring=scoring)
    for label, estimator in models.items()
}
ridge_scores = cv_scores['Ridge']
lasso_scores = cv_scores['Lasso']
elasticnet_scores = cv_scores['ElasticNet']

# 3. Comparison table, with the sign flipped back to a positive MSE
print("Model Comparison (5-fold CV):")
print("-" * 40)
for name, scores in cv_scores.items():
    mse = -scores.mean()
    std = scores.std()
    print(f"{name:12s} MSE: {mse:8.2f} +/- {std:.2f}")

# 4. Refit on all the data and count coefficients that are numerically zero
print("\nNumber of zero coefficients:")
for name, model in models.items():
    model.fit(X, y)
    near_zero = np.abs(model.coef_) < 1e-10
    n_zeros = near_zero.sum()
    print(f"{name:12s}: {n_zeros} / {len(model.coef_)} features zeroed out")

# Lasso performs feature selection by driving uninformative coefficients to
# zero, ElasticNet zeros out some as well, and Ridge leaves every
# coefficient non-zero.
```

</details>

---
## Exercise 4: LassoCV for Optimal Alpha

**Goal:** Use `LassoCV` to automatically find the best regularization strength
and identify which features get zeroed out.

**Tasks:**
1. Generate a synthetic dataset with `make_regression` (20 features, 5 informative).
2. Split the data 80/20 (`random_state=42`) and fit `LassoCV` with 5-fold CV on the training set.
3. Report the optimal alpha and the R-squared on the test set.
4. List which features are zeroed out (coefficient = 0).
5. Plot the coefficient values as a bar chart.

In [None]:
# Exercise 4 - Starter Code
# Builds the synthetic dataset and an 80/20 split; the TODOs fit LassoCV on
# the training part and inspect its coefficients.
# `plt` is not imported here; it comes from the setup cell.

from sklearn.datasets import make_regression
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split

# Generate synthetic data
# (coef=True adds a third return value, kept here as `true_coefs`)
X, y, true_coefs = make_regression(
    n_samples=500, n_features=20, n_informative=5,
    noise=10, coef=True, random_state=42
)
feature_names = [f'feature_{i}' for i in range(20)]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TODO 1: Fit LassoCV
# lasso_cv = LassoCV(cv=5, random_state=42)
# lasso_cv.fit(X_train, y_train)

# TODO 2: Report optimal alpha
# print(f"Optimal alpha: {lasso_cv.alpha_:.4f}")

# TODO 3: Report R2 on test set
# r2 = lasso_cv.score(X_test, y_test)
# print(f"Test R2: {r2:.4f}")

# TODO 4: Identify zeroed-out features
# (a coefficient counts as zero when its magnitude is below 1e-10)
# for name, coef in zip(feature_names, lasso_cv.coef_):
#     status = 'ZERO' if abs(coef) < 1e-10 else f'{coef:.4f}'
#     print(f"{name}: {status}")

# TODO 5: Bar chart of coefficients
# plt.figure(figsize=(12, 5))
# plt.bar(feature_names, lasso_cv.coef_)
# plt.xticks(rotation=45)
# plt.ylabel('Coefficient Value')
# plt.title('Lasso Coefficients (LassoCV)')
# plt.tight_layout()
# plt.show()

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import make_regression
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split

# coef=True also returns the coefficients used to generate y.
X, y, true_coefs = make_regression(
    n_samples=500, n_features=20, n_informative=5,
    noise=10, coef=True, random_state=42
)
feature_names = [f'feature_{i}' for i in range(20)]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 1. Fit LassoCV on the training split
lasso_cv = LassoCV(cv=5, random_state=42).fit(X_train, y_train)

# 2. Optimal alpha
print(f"Optimal alpha: {lasso_cv.alpha_:.4f}")

# 3. Test R2
r2 = lasso_cv.score(X_test, y_test)
print(f"Test R2: {r2:.4f}")

# 4. Identify zeroed-out features
ZERO_TOL = 1e-10  # magnitudes below this are reported as exactly zero


def _coef_label(value):
    """Return 'ZERO' for a numerically-zero coefficient, else 4-decimal text."""
    if abs(value) < ZERO_TOL:
        return 'ZERO'
    return f'{value:.4f}'


print("\nFeature coefficients:")
for name, coef, true_c in zip(feature_names, lasso_cv.coef_, true_coefs):
    status = _coef_label(coef)
    true_status = _coef_label(true_c)
    print(f"{name}: Lasso={status:>10s}  True={true_status:>10s}")
n_zeros = int(np.sum(np.abs(lasso_cv.coef_) < ZERO_TOL))
print(f"\nFeatures zeroed out: {n_zeros} / 20")

# 5. Grouped bar chart: fitted coefficient next to the true one per feature
fig, ax = plt.subplots(figsize=(12, 5))
x_pos = np.arange(len(feature_names))
ax.bar(x_pos - 0.2, lasso_cv.coef_, 0.4, label='Lasso', alpha=0.8)
ax.bar(x_pos + 0.2, true_coefs, 0.4, label='True', alpha=0.8)
ax.set_xticks(x_pos)
ax.set_xticklabels(feature_names, rotation=45)
ax.set_ylabel('Coefficient Value')
ax.set_title('Lasso vs True Coefficients')
ax.legend()
fig.tight_layout()
plt.show()
```

</details>

---
## Exercise 5: Full Pipeline - StandardScaler + Ridge

**Goal:** Build a complete pipeline that scales features and applies Ridge
regression, then evaluate with cross-validation.

**Tasks:**
1. Load the California Housing dataset.
2. Create a `Pipeline` with `StandardScaler` and `Ridge(alpha=1.0)`.
3. Evaluate with 5-fold cross-validation (R-squared scoring).
4. Compare the pipeline results against an unscaled Ridge model.
5. Print both results.

In [None]:
# Exercise 5 - Starter Code
# Loads California Housing as (X, y); the TODOs build a scaler + Ridge
# pipeline and compare its cross-validated R2 against Ridge on raw features.

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

housing = fetch_california_housing()
X, y = housing.data, housing.target

# TODO 1: Create pipeline (StandardScaler -> Ridge)
# (step order matters: the scaler runs before the regressor)
# pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('ridge', Ridge(alpha=1.0))
# ])

# TODO 2: Evaluate pipeline with 5-fold CV (scoring='r2')
# pipeline_scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')

# TODO 3: Evaluate unscaled Ridge for comparison
# (same alpha, cv and scoring as above, so scaling is the only difference)
# ridge_only = Ridge(alpha=1.0)
# unscaled_scores = cross_val_score(ridge_only, X, y, cv=5, scoring='r2')

# TODO 4: Print comparison
# print(f"Pipeline (Scaled + Ridge) R2: {pipeline_scores.mean():.4f} +/- {pipeline_scores.std():.4f}")
# print(f"Unscaled Ridge          R2: {unscaled_scores.mean():.4f} +/- {unscaled_scores.std():.4f}")

### Solution

<details><summary>Click to reveal solution</summary>

```python
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

housing = fetch_california_housing()
X = housing.data
y = housing.target

# 1. Two candidates that differ only in whether features are standardized
scaler_step = ('scaler', StandardScaler())
ridge_step = ('ridge', Ridge(alpha=1.0))
pipeline = Pipeline([scaler_step, ridge_step])
ridge_only = Ridge(alpha=1.0)

# 2 & 3. Identical CV settings for both, so the scores are comparable
cv_settings = {'cv': 5, 'scoring': 'r2'}
pipeline_scores = cross_val_score(pipeline, X, y, **cv_settings)
unscaled_scores = cross_val_score(ridge_only, X, y, **cv_settings)

# 4. Report mean +/- std across folds, then a one-line verdict
print("Pipeline vs Unscaled Ridge:")
print(f"Pipeline (Scaled + Ridge) R2: {pipeline_scores.mean():.4f} +/- {pipeline_scores.std():.4f}")
print(f"Unscaled Ridge           R2: {unscaled_scores.mean():.4f} +/- {unscaled_scores.std():.4f}")
print(f"\nScaling {'improved' if pipeline_scores.mean() > unscaled_scores.mean() else 'did not improve'} performance.")
```

</details>