# Part 1: Regression Task (California Housing)

Task 1: Load and Split Dataset

In [None]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error


In [None]:

# Fetch the California Housing data as a (features, target) pair.
X, y = fetch_california_housing(return_X_y=True)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible from run to run.
housing_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = housing_split


Data folder: /root/scikit_learn_data


# Task 2: Complete All the Tasks

• Regression Task (California Housing):

– Step 1: Baseline Model (No Regularization) Build a Linear Regression model without
any regularization.

In [None]:
# Unregularized baseline: ordinary least squares fitted on the training split.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Predict on both splits up front so the report below only does printing.
y_train_pred = lin_reg.predict(X_train)
y_test_pred = lin_reg.predict(X_test)

print("Baseline Linear Regression")
print("Coefficients:", lin_reg.coef_)
print("Intercept:", lin_reg.intercept_)

# Train vs. test MSE; a large gap between the two would point to overfitting.
for split_name, y_true, y_pred in (
    ("Train", y_train, y_train_pred),
    ("Test", y_test, y_test_pred),
):
    print(f"{split_name} MSE:", mean_squared_error(y_true, y_pred))


– Step 2: Hyperparameter Tuning Use GridSearchCV or RandomizedSearchCV to tune
hyperparameters for Ridge and Lasso regression models.

In [None]:
# Candidate regularization strengths: 13 log-spaced values from 1e-3 to 1e0.
alpha_grid = np.logspace(-3, 0, 13)

ridge = Ridge(random_state=42)
# Lasso gets a higher iteration cap than the library default.
lasso = Lasso(random_state=42, max_iter=10000)

# Both searches share the same grid and CV protocol so their scores are
# directly comparable. Scoring is *negative* MSE, because GridSearchCV
# always maximizes its score.
search_settings = dict(
    param_grid={"alpha": alpha_grid},
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
)
ridge_cv = GridSearchCV(ridge, **search_settings)
lasso_cv = GridSearchCV(lasso, **search_settings)

ridge_cv.fit(X_train, y_train)
lasso_cv.fit(X_train, y_train)

print("\nHyperparameter Tuning Results")
for model_label, search in (("Ridge", ridge_cv), ("Lasso", lasso_cv)):
    print(f"Best {model_label} alpha:", search.best_params_["alpha"])
    # Flip the sign back so the reported value is a plain MSE.
    print(f"Best {model_label} CV MSE:", -search.best_score_)


– Step 3: Regularization Experiments (L1 vs L2) Train L1 (Lasso) and L2 (Ridge)
regression models using the optimal hyperparameters.

In [33]:
# NOTE(review): the saved output under this cell lists 10 coefficients and an
# MSE around 2900, which does not look like California Housing (8 features).
# It was presumably produced against a different dataset -- re-run the
# notebook from a fresh kernel before trusting those numbers.

# Winning estimators from the two searches (already fitted by GridSearchCV,
# since the searches use the default refit behaviour).
best_ridge = ridge_cv.best_estimator_
best_lasso = lasso_cv.best_estimator_

# Train/test predictions for each tuned model.
ridge_train_pred, ridge_test_pred = (
    best_ridge.predict(features) for features in (X_train, X_test)
)
lasso_train_pred, lasso_test_pred = (
    best_lasso.predict(features) for features in (X_train, X_test)
)

# Same report layout for both models: heading, coefficients, train/test MSE.
tuned_reports = (
    ("\nRidge (L2) with best alpha", best_ridge, ridge_train_pred, ridge_test_pred),
    ("\nLasso (L1) with best alpha", best_lasso, lasso_train_pred, lasso_test_pred),
)
for heading, tuned_model, train_pred, test_pred in tuned_reports:
    print(heading)
    print("Coefficients:", tuned_model.coef_)
    print("Train MSE:", mean_squared_error(y_train, train_pred))
    print("Test MSE:", mean_squared_error(y_test, test_pred))

# Sparsity check: count coefficients that are exactly zero in each model.
lasso_zero_count = (best_lasso.coef_ == 0).sum()
ridge_zero_count = (best_ridge.coef_ == 0).sum()
print("\nNumber of zero coefficients in Lasso:", lasso_zero_count)
print("Number of zero coefficients in Ridge:", ridge_zero_count)



Ridge (L2) with best alpha
Coefficients: [  42.33215803 -219.70224625  525.84455459  327.90895925 -155.27747239
  -65.55682386 -176.83023234  160.61559898  422.12347747   70.18920108]
Train MSE: 2899.0575672882524
Test MSE: 2864.6382255813583

Lasso (L1) with best alpha
Coefficients: [   8.6963877  -187.0826068   554.88504012  320.41189319 -148.25316407
   -0.         -209.30900722   43.99621921  470.76873329   42.28598551]
Train MSE: 2907.1621670305
Test MSE: 2814.3876739418333

Number of zero coefficients in Lasso: 1
Number of zero coefficients in Ridge: 0


# Part 2: Classification Task (Breast Cancer)

Task 1: Load and Split Dataset

In [34]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV


In [35]:
# Load the breast cancer dataset as a (features, labels) pair.
X_cls, y_cls = load_breast_cancer(return_X_y=True)

# 80/20 train/test split with a fixed seed for reproducibility.
cancer_split = train_test_split(X_cls, y_cls, test_size=0.2, random_state=42)
X_train_cls, X_test_cls, y_train_cls, y_test_cls = cancer_split


Task 2: Complete All the Tasks
– Step 1: Baseline Model (No Regularization) Build a Logistic Regression model without
specifying any regularization (default settings).

In [36]:
# Default-settings baseline. Only max_iter is raised above the library
# default; everything else is left untouched.
# NOTE: scikit-learn's LogisticRegression applies L2 regularization (C=1.0)
# by default, so this baseline is not strictly "unregularized".
baseline = LogisticRegression(max_iter=10000)
baseline.fit(X_train_cls, y_train_cls)

print("Baseline coefficients:\n", baseline.coef_)

# Accuracy on the training split, then on the held-out split.
for split_name, features, labels in (
    ("train", X_train_cls, y_train_cls),
    ("test", X_test_cls, y_test_cls),
):
    print(f"Baseline {split_name} acc:", accuracy_score(labels, baseline.predict(features)))



Baseline coefficients:
 [[ 1.0274368   0.22145051 -0.36213488  0.0254667  -0.15623532 -0.23771256
  -0.53255786 -0.28369224 -0.22668189 -0.03649446 -0.09710208  1.3705667
  -0.18140942 -0.08719575 -0.02245523  0.04736092 -0.04294784 -0.03240188
  -0.03473732  0.01160522  0.11165329 -0.50887722 -0.01555395 -0.016857
  -0.30773117 -0.77270908 -1.42859535 -0.51092923 -0.74689363 -0.10094404]]
Baseline train acc: 0.9582417582417583
Baseline test acc: 0.956140350877193


– Step 2: Hyperparameter Tuning Use GridSearchCV or RandomizedSearchCV to tune
hyperparameters for logistic regression models with regularization.

In [37]:
# Search both penalty types over 7 log-spaced values of C (1e-3 ... 1e3).
# liblinear is pinned because it supports both l1 and l2 penalties.
param_grid = dict(
    penalty=["l1", "l2"],
    C=np.logspace(-3, 3, 7),
    solver=["liblinear"],
)

logreg = LogisticRegression(max_iter=10000)

# 5-fold cross-validated grid search, scored by accuracy, using all cores.
grid = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)
grid.fit(X_train_cls, y_train_cls)

# Evaluate the winning configuration on the held-out split.
best_model_test_pred = grid.best_estimator_.predict(X_test_cls)

print("\nBest params:", grid.best_params_)
print("Best CV acc:", grid.best_score_)
print("Test acc (best model):", accuracy_score(y_test_cls, best_model_test_pred))



Best params: {'C': np.float64(100.0), 'penalty': 'l1', 'solver': 'liblinear'}
Best CV acc: 0.9670329670329672
Test acc (best model): 0.9824561403508771


Step 3: Regularization Experiments (L1 vs L2)

In [38]:
# Joint optimum over both penalties. It belongs to whichever penalty won the
# grid search, so it is only the right C for that one penalty. Kept defined
# for any later cell that reads it.
best_C = grid.best_params_["C"]

# Pick the best C separately for each penalty from the cross-validation
# results, so the L1-vs-L2 comparison uses each model's own tuned strength
# rather than forcing the winner's C onto the other penalty.
cv_params = grid.cv_results_["params"]
cv_scores = np.asarray(grid.cv_results_["mean_test_score"])

best_C_by_penalty = {}
for penalty in ("l1", "l2"):
    candidate_idx = [i for i, p in enumerate(cv_params) if p["penalty"] == penalty]
    # nanargmax ignores candidates whose fit failed (NaN score) and returns the
    # first maximum, which matches GridSearchCV's own tie-breaking order.
    winner_idx = candidate_idx[int(np.nanargmax(cv_scores[candidate_idx]))]
    best_C_by_penalty[penalty] = cv_params[winner_idx]["C"]

# Fit and report one model per penalty using identical code paths.
penalty_models = {}
for penalty in ("l1", "l2"):
    tag = penalty.upper()
    model = LogisticRegression(
        penalty=penalty,
        C=best_C_by_penalty[penalty],
        solver="liblinear",  # supports both l1 and l2
        max_iter=10000,
    )
    model.fit(X_train_cls, y_train_cls)
    penalty_models[penalty] = model

    print(f"\n{tag} coefficients:\n", model.coef_)
    print(f"{tag} best C:", best_C_by_penalty[penalty])
    # Exact zeros show how many features the penalty removed from the model.
    print(f"{tag} zero coeffs:", np.sum(model.coef_ == 0))
    print(f"{tag} train acc:", accuracy_score(y_train_cls, model.predict(X_train_cls)))
    print(f"{tag} test acc:", accuracy_score(y_test_cls, model.predict(X_test_cls)))

# Preserve the original variable names for downstream cells.
logreg_l1 = penalty_models["l1"]
logreg_l2 = penalty_models["l2"]



L1 coefficients:
 [[ 7.39482762e-01 -1.08576906e-01  1.04134313e-01 -2.63067926e-03
   0.00000000e+00  4.70821620e+01 -1.20132423e+01 -1.36067471e+02
   1.97045752e+01  0.00000000e+00  0.00000000e+00  1.73365254e+00
   0.00000000e+00 -1.97749594e-01  0.00000000e+00  0.00000000e+00
   5.02459560e+01  0.00000000e+00  1.85278531e+01  0.00000000e+00
   1.69760139e-01 -4.39015075e-01  5.68516254e-02 -2.06397936e-02
  -2.17868055e+01  7.99056169e+00 -1.46434989e+01 -2.52748434e+01
  -2.52667860e+01  0.00000000e+00]]
L1 zero coeffs: 9
L1 train acc: 0.989010989010989
L1 test acc: 0.9824561403508771

L2 coefficients:
 [[ 5.40272741  0.26573732 -0.52651203 -0.02095479 -2.29922151 -0.2169419
  -3.56980218 -5.0110607  -2.26418385  0.36762747 -0.58036525  3.84169255
  -0.63875637 -0.10714305 -0.40591246  3.57237002  4.36577514 -0.26636925
   0.39354765  0.62630698 -0.20718582 -0.68999884  0.17661962 -0.01813087
  -4.67716843 -0.0140846  -4.46493229 -7.61271125 -6.83571871  0.57463274]]
L2 zero coe