Part 1: Regression Task – Diabetes Dataset
Task 1: Load and Split Dataset (80% train, 20% test)

In [4]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# Feature matrix and target vector of the diabetes regression dataset.
X, y = load_diabetes(return_X_y=True)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Sanity check: row/column counts of the two partitions.
print(X_train.shape, X_test.shape)


(353, 10) (89, 10)


Task 2: Regression Experiments
Step 1: Baseline Linear Regression (No Regularization)

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Unregularised least-squares baseline; fit() returns the estimator itself,
# so construction and training are chained.
lr = LinearRegression().fit(X_train, y_train)

# Predictions on both partitions, so the train and test errors can be compared.
y_train_pred, y_test_pred = lr.predict(X_train), lr.predict(X_test)

# Mean squared error on each partition.
train_mse, test_mse = (
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred),
)

print("Training MSE:", train_mse)
print("Test MSE:", test_mse)

# Learned weight of every input feature.
print("Coefficients:", lr.coef_)


Training MSE: 2868.549702835577
Test MSE: 2900.193628493482
Coefficients: [  37.90402135 -241.96436231  542.42875852  347.70384391 -931.48884588
  518.06227698  163.41998299  275.31790158  736.1988589    48.67065743]


Step 2: Hyperparameter Tuning (Ridge & Lasso)
Ridge Regression (L2)

In [6]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths (also reused by the Lasso search below).
alpha_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}

ridge = Ridge()

# 5-fold cross-validated search over alpha, ranked by negative MSE
# (scikit-learn maximises scores, hence the sign flip).
ridge_cv = GridSearchCV(
    estimator=ridge,
    param_grid=alpha_grid,
    scoring='neg_mean_squared_error',
    cv=5,
).fit(X_train, y_train)

print("Best alpha (Ridge):", ridge_cv.best_params_)

# Score the winning estimator on the held-out test partition.
ridge_best = ridge_cv.best_estimator_
ridge_test_pred = ridge_best.predict(X_test)
ridge_test_mse = mean_squared_error(y_test, ridge_test_pred)

print("Ridge Test MSE:", ridge_test_mse)


Best alpha (Ridge): {'alpha': 0.1}
Ridge Test MSE: 2856.4868876706537


Lasso Regression (L1)

In [7]:
from sklearn.linear_model import Lasso

# Lasso with a raised iteration cap for its solver.
lasso = Lasso(max_iter=10000)

# Same alpha candidates and 5-fold protocol as the Ridge search.
lasso_cv = GridSearchCV(
    estimator=lasso,
    param_grid=alpha_grid,
    scoring='neg_mean_squared_error',
    cv=5,
).fit(X_train, y_train)

print("Best alpha (Lasso):", lasso_cv.best_params_)

# Score the winning estimator on the held-out test partition.
lasso_best = lasso_cv.best_estimator_
lasso_test_pred = lasso_best.predict(X_test)
lasso_test_mse = mean_squared_error(y_test, lasso_test_pred)

print("Lasso Test MSE:", lasso_test_mse)


Best alpha (Lasso): {'alpha': 0.1}
Lasso Test MSE: 2798.193485169719


Step 3: L1 vs L2 Comparison

In [8]:
import numpy as np

# Show the tuned coefficient vectors one after the other; sep="\n" places each
# array on the line below its label.
print("Ridge coefficients:", ridge_best.coef_, sep="\n")
print("\nLasso coefficients:", lasso_best.coef_, sep="\n")

# Count the weights the L1 penalty drove exactly to zero.
print("\nNumber of zero coefficients in Lasso:", np.sum(lasso_best.coef_ == 0))


Ridge coefficients:
[  42.85566976 -205.49431899  505.08903304  317.0932049  -108.50026183
  -86.23673333 -190.36318008  151.70708637  392.28931896   79.9081772 ]

Lasso coefficients:
[   0.         -152.66477923  552.69777529  303.36515791  -81.36500664
   -0.         -229.25577639    0.          447.91952518   29.64261704]

Number of zero coefficients in Lasso: 3


Part 2: Classification Task – Breast Cancer Dataset
Task 1: Load and Split Dataset

In [9]:
from sklearn.datasets import load_breast_cancer

# Features and binary labels of the breast cancer dataset.
# NOTE: X, y and the split names are rebound here, replacing the Part 1 regression data.
X, y = load_breast_cancer(return_X_y=True)

# 80/20 train/test split with a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


Task 2: Classification Experiments
Step 1: Baseline Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline classifier with default regularisation settings; only the iteration
# cap is raised. fit() returns the estimator, so it is chained.
log_reg = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Accuracy on the training and held-out partitions.
train_acc, test_acc = (
    accuracy_score(y_train, log_reg.predict(X_train)),
    accuracy_score(y_test, log_reg.predict(X_test)),
)

print("Training Accuracy:", train_acc)
print("Test Accuracy:", test_acc)

# One weight per input feature.
print("Coefficients:", log_reg.coef_)


Training Accuracy: 0.9582417582417583
Test Accuracy: 0.956140350877193
Coefficients: [[ 1.0274368   0.22145051 -0.36213488  0.0254667  -0.15623532 -0.23771256
  -0.53255786 -0.28369224 -0.22668189 -0.03649446 -0.09710208  1.3705667
  -0.18140942 -0.08719575 -0.02245523  0.04736092 -0.04294784 -0.03240188
  -0.03473732  0.01160522  0.11165329 -0.50887722 -0.01555395 -0.016857
  -0.30773117 -0.77270908 -1.42859535 -0.51092923 -0.74689363 -0.10094404]]


Step 2: Hyperparameter Tuning (C and Penalty)

In [11]:
# Search space: inverse regularisation strength C crossed with the penalty
# type, with the solver pinned to liblinear for every candidate.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
}

# 5-fold cross-validated search ranked by accuracy.
log_cv = GridSearchCV(
    estimator=LogisticRegression(max_iter=10000),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

print("Best Parameters:", log_cv.best_params_)


Best Parameters: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}


Step 3: L1 vs L2 Logistic Regression

In [12]:
# Winner of the C/penalty grid search from the previous step.
best_model = log_cv.best_estimator_

train_acc = accuracy_score(y_train, best_model.predict(X_train))
test_acc = accuracy_score(y_test, best_model.predict(X_test))

print("Training Accuracy:", train_acc)
print("Test Accuracy:", test_acc)

print("Coefficients:", best_model.coef_)
print("Zero coefficients:", (best_model.coef_ == 0).sum())

# This step is an L1-vs-L2 comparison, but the grid-search winner only shows
# one of the two penalties. Fit one model per penalty at the tuned C (same
# solver and iteration cap as the search) so that accuracy and sparsity can be
# read side by side, mirroring the Ridge/Lasso comparison in Part 1.
tuned_C = log_cv.best_params_['C']
for penalty_name in ('l1', 'l2'):
    penalty_model = LogisticRegression(
        penalty=penalty_name, C=tuned_C, solver='liblinear', max_iter=10000
    ).fit(X_train, y_train)

    print("\nPenalty:", penalty_name, "| C:", tuned_C)
    print("  Training Accuracy:",
          accuracy_score(y_train, penalty_model.predict(X_train)))
    print("  Test Accuracy:",
          accuracy_score(y_test, penalty_model.predict(X_test)))
    # Exact zeros show how many features each penalty removed from the model.
    print("  Zero coefficients:", (penalty_model.coef_ == 0).sum())


Training Accuracy: 0.989010989010989
Test Accuracy: 0.9824561403508771
Coefficients: [[ 7.29579738e-01 -1.07477424e-01  9.31610815e-02 -1.75208049e-03
   0.00000000e+00  4.74818247e+01 -1.19452734e+01 -1.36000562e+02
   1.98212899e+01  0.00000000e+00  0.00000000e+00  1.72673802e+00
   0.00000000e+00 -1.97689997e-01  0.00000000e+00  0.00000000e+00
   4.99800105e+01  0.00000000e+00  1.85059190e+01  0.00000000e+00
   1.89501236e-01 -4.39015164e-01  5.76214783e-02 -2.08700005e-02
  -2.17947402e+01  7.98918690e+00 -1.46415918e+01 -2.52076522e+01
  -2.53448161e+01  0.00000000e+00]]
Zero coefficients: 9
