# Part 1: Regression Task (California Housing)

In [17]:
import kagglehub

# Kaggle handle of the California housing dataset used in Part 1.
DATASET_HANDLE = "camnugent/california-housing-prices"

# Download the latest version; the returned value is the local location of
# the dataset files and is printed below for reference.
path = kagglehub.dataset_download(DATASET_HANDLE)

print(f"Path to dataset files: {path}")

Using Colab cache for faster access to the 'california-housing-prices' dataset.
Path to dataset files: /kaggle/input/california-housing-prices


## Task 1: Load and Split Dataset

In [24]:

import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset from the directory returned by kagglehub (`path`, set in
# the download cell) rather than from a hard-coded Kaggle/Colab location.
df = pd.read_csv(os.path.join(path, 'housing.csv'))

# df still contains the target column, so it is reported separately.
print(f"Dataset loaded: {df.shape[0]} samples, {df.shape[1] - 1} features + 1 target")

# Pick the train/test rows first so the imputation statistic below can be
# computed from training rows only (nothing leaks from the test set).
# Splitting row positions with the same test_size/random_state selects the
# same rows, in the same order, as splitting X and y directly.
row_positions = np.arange(len(df))
train_rows, test_rows = train_test_split(
    row_positions, test_size=0.2, random_state=42
)

# Handle missing values: fill with the median of the training rows.
bedrooms_median = df['total_bedrooms'].iloc[train_rows].median()
df['total_bedrooms'] = df['total_bedrooms'].fillna(bedrooms_median)

# Prepare features and target
X = df.drop('median_house_value', axis=1)
y = df['median_house_value']

# One-hot encode the categorical feature. dtype=float keeps every column
# numeric, so the array built below is float64 instead of object dtype.
X = pd.get_dummies(X, columns=['ocean_proximity'], drop_first=True, dtype=float)

# Convert to numpy
X = X.to_numpy(dtype=float)
y = y.to_numpy()

# Training (80%) and test (20%) sets, built from the rows chosen above.
X_train, X_test = X[train_rows], X[test_rows]
y_train, y_test = y[train_rows], y[test_rows]

print(f"\nTraining set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Features: {X_train.shape[1]}")


Dataset loaded: 20640 samples, 10 features

Training set: (16512, 12)
Test set: (4128, 12)
Features: 12


## Task 2, Step 1: Baseline Model (No Regularization)

In [25]:

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Baseline: plain linear regression fitted on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

# Observe coefficients: show at most the first 5, then summarise the rest.
n_coefs = len(model.coef_)
n_shown = min(5, n_coefs)
print("Coefficients:")
for i in range(n_shown):
    print(f"  Feature {i}: {model.coef_[i]:.6f}")
# Only mention remaining coefficients when there are any; with fewer than
# 5 features an unconditional "len - 5" would print a negative count.
if n_coefs > n_shown:
    print(f"  ... and {n_coefs - n_shown} more")
print(f"Intercept: {model.intercept_:.6f}")

# Compute MSE on both splits
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)

print(f"\nTraining MSE: {mse_train:.6f}")
print(f"Test MSE: {mse_test:.6f}")


Coefficients:
  Feature 0: -26838.273372
  Feature 1: -25468.352050
  Feature 2: 1102.185084
  Feature 3: -6.021506
  Feature 4: 102.789395
  ... and 7 more
Intercept: -2275547.381716

Training MSE: 4683203783.504253
Test MSE: 4908476721.156583


## Task 2, Step 2: Hyperparameter Tuning

In [26]:

from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV

# Candidate regularization strengths tried for both models.
alphas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Settings shared by both searches: 5-fold CV scored by negative MSE.
search_options = dict(cv=5, scoring='neg_mean_squared_error')

# NOTE(review): both models are fitted on the features as they come out of
# the split cell; if those are unscaled, the penalty acts unevenly across
# features -- confirm whether standardization is wanted here.

# Ridge Regression tuning
ridge = Ridge()
ridge_grid = GridSearchCV(ridge, {'alpha': alphas}, **search_options)
ridge_grid.fit(X_train, y_train)

best_ridge_alpha = ridge_grid.best_params_['alpha']
print(f"Ridge - Best alpha: {best_ridge_alpha}")

# Lasso Regression tuning (higher iteration cap than the default)
lasso = Lasso(max_iter=10000)
lasso_grid = GridSearchCV(lasso, {'alpha': alphas}, **search_options)
lasso_grid.fit(X_train, y_train)

best_lasso_alpha = lasso_grid.best_params_['alpha']
print(f"Lasso - Best alpha: {best_lasso_alpha}")

# Test set evaluation: predict through each fitted search object.
ridge_pred = ridge_grid.predict(X_test)
ridge_mse = mean_squared_error(y_test, ridge_pred)

lasso_pred = lasso_grid.predict(X_test)
lasso_mse = mean_squared_error(y_test, lasso_pred)

print(f"\nTest MSE - Ridge: {ridge_mse:.6f}")
print(f"Test MSE - Lasso: {lasso_mse:.6f}")


Ridge - Best alpha: 1
Lasso - Best alpha: 0.001

Test MSE - Ridge: 4910037869.229350
Test MSE - Lasso: 4908476915.683493


## Task 2, Step 3: Regularization Experiments (L1 vs L2)

In [27]:

# Refit Ridge and Lasso on the training split with the alphas selected by
# the grid searches.
ridge_best = Ridge(alpha=ridge_grid.best_params_['alpha'])
lasso_best = Lasso(alpha=lasso_grid.best_params_['alpha'], max_iter=10000)

ridge_best.fit(X_train, y_train)
lasso_best.fit(X_train, y_train)

# Compare coefficients side by side. The row count is capped by the number
# of available coefficients so the loop cannot index past the end.
n_compare = min(10, len(model.coef_))
print(f"\nCoefficient Comparison (first {n_compare} features):")
# Fixed-width columns keep the table aligned whatever the magnitudes are.
print(f"{'Feature':<8} {'Baseline':>18} {'Ridge':>18} {'Lasso':>18}")
for i in range(n_compare):
    print(
        f"{i:<8} {model.coef_[i]:>18.6f} "
        f"{ridge_best.coef_[i]:>18.6f} {lasso_best.coef_[i]:>18.6f}"
    )

# Count coefficients that are exactly zero (vectorised rather than a
# Python-level sum over the array).
zero_lasso = int((lasso_best.coef_ == 0).sum())
print(f"\nZero coefficients in Lasso: {zero_lasso}/{len(lasso_best.coef_)}")

# Performance comparison
ridge_train_mse = mean_squared_error(y_train, ridge_best.predict(X_train))
ridge_test_mse = mean_squared_error(y_test, ridge_best.predict(X_test))

lasso_train_mse = mean_squared_error(y_train, lasso_best.predict(X_train))
lasso_test_mse = mean_squared_error(y_test, lasso_best.predict(X_test))

# Columns are 20 wide: the MSE values printed with 6 decimals are longer
# than the previous 15-character columns, which misaligned the table.
print(f"\nPerformance Comparison:")
print(f"{'Model':<10} {'Train MSE':>20} {'Test MSE':>20}")
print("-" * 52)
print(f"{'Baseline':<10} {mse_train:>20.6f} {mse_test:>20.6f}")
print(f"{'Ridge':<10} {ridge_train_mse:>20.6f} {ridge_test_mse:>20.6f}")
print(f"{'Lasso':<10} {lasso_train_mse:>20.6f} {lasso_test_mse:>20.6f}")

# NOTE(review): these are general statements about L1/L2; check them against
# the zero-coefficient count printed above before relying on point 1.
print("\nDiscussion:")
print("1. L1 produces sparse coefficients (feature selection)")
print("2. L2 shrinks coefficients without zeroing them")
print("3. Regularization reduces variance, prevents overfitting")
print("4. Excessive regularization increases bias")




Coefficient Comparison (first 10 features):
Feature	Baseline		Ridge			Lasso
0	-26838.273372	-26860.198129	-26838.273386
1	-25468.352050	-25493.131209	-25468.353841
2	1102.185084	1102.629029	1102.185112
3	-6.021506	-6.019346	-6.021506
4	102.789395	102.927379	102.789411
5	-38.172906	-38.177632	-38.172907
6	48.252753	48.099003	48.252736
7	39473.975175	39470.951501	39473.974756
8	-39786.656161	-39759.650387	-39786.649960
9	136125.072615	108852.317864	136120.938950

Zero coefficients in Lasso: 0/12

Performance Comparison:
Model      Train MSE       Test MSE       
----------------------------------------
Baseline   4683203783.504253 4908476721.156583
Ridge      4683383574.687478 4910037869.229350
Lasso      4683203783.508414 4908476915.683493

Discussion:
1. L1 produces sparse coefficients (feature selection)
2. L2 shrinks coefficients without zeroing them
3. Regularization reduces variance, prevents overfitting
4. Excessive regularization increases bias


# Part 2: Classification Task (Breast Cancer)

## Task 1: Load and Split Dataset

In [28]:

from sklearn.datasets import load_breast_cancer

# Features and labels of the breast-cancer dataset, returned as a pair.
X, y = load_breast_cancer(return_X_y=True)

# Hold out 20% of the samples for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

for split_name, split_X in (("Training", X_train), ("Test", X_test)):
    print(f"{split_name} set size: {split_X.shape}")


Training set size: (455, 30)
Test set size: (114, 30)


## Task 2, Step 1: Baseline Model (No Regularization)

In [11]:


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Reference classifier: logistic regression with the penalty switched off.
logreg_baseline = LogisticRegression(solver='lbfgs', penalty=None, max_iter=10000)
logreg_baseline.fit(X_train, y_train)

# The first row of coef_ holds the fitted coefficients; report how many.
print("Number of coefficients:", logreg_baseline.coef_[0].size)

# Accuracy on the training split
y_train_pred = logreg_baseline.predict(X_train)
train_acc = accuracy_score(y_train, y_train_pred)

# Accuracy on the held-out split
y_test_pred = logreg_baseline.predict(X_test)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"\nTraining Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print("Step 1 completed ✓")

Number of coefficients: 30

Training Accuracy: 0.9868
Test Accuracy: 0.9825
Step 1 completed ✓


## Task 2, Step 2: Hyperparameter Tuning

In [14]:
from sklearn.model_selection import GridSearchCV

# Search space: seven values of C crossed with both penalty types, all
# solved with liblinear.
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)

# 5-fold cross-validated grid search scored by accuracy.
logreg = LogisticRegression(max_iter=10000)
grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best CV accuracy:", grid_search.best_score_)

# Score the selected estimator on the held-out test split.
best_model = grid_search.best_estimator_
tuned_test_pred = best_model.predict(X_test)
test_acc_tuned = accuracy_score(y_test, tuned_test_pred)
print(f"Test Accuracy with best model: {test_acc_tuned:.4f}")
print("Step 2 completed ✓")

Best parameters: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best CV accuracy: 0.9670329670329672
Test Accuracy with best model: 0.9825
Step 2 completed ✓


## Task 2, Step 3: Regularization Experiments (L1 vs L2)

In [30]:

def _best_index_for_penalty(cv_results, penalty):
    """Return the index of the best-scoring grid candidate that uses `penalty`.

    Looks through cv_results['params'] for candidates whose 'penalty' entry
    equals `penalty` and returns the position of the one with the highest
    cv_results['mean_test_score']. On ties the earliest candidate wins.

    Raises:
        ValueError: if no candidate in the search used `penalty`.
    """
    candidates = [
        i for i, params in enumerate(cv_results['params'])
        if params['penalty'] == penalty
    ]
    if not candidates:
        raise ValueError(f"No grid-search candidate uses penalty={penalty!r}")
    scores = cv_results['mean_test_score']
    # max() keeps the first maximal element, so ties resolve to the earliest.
    return max(candidates, key=lambda i: scores[i])


# Get best parameters for L1 and L2 from the grid-search results
cv_results = grid_search.cv_results_
best_l1_idx = _best_index_for_penalty(cv_results, 'l1')
best_l2_idx = _best_index_for_penalty(cv_results, 'l2')

best_l1_params = cv_results['params'][best_l1_idx]
best_l2_params = cv_results['params'][best_l2_idx]

print("Best L1 parameters:", best_l1_params)
print("Best L2 parameters:", best_l2_params)

# Train models with best parameters
logreg_l1 = LogisticRegression(**best_l1_params, max_iter=10000)
logreg_l2 = LogisticRegression(**best_l2_params, max_iter=10000)

logreg_l1.fit(X_train, y_train)
logreg_l2.fit(X_train, y_train)

# Compare coefficients. The row count is capped by the number of available
# coefficients so the loop cannot index past the end.
n_compare = min(10, len(logreg_baseline.coef_[0]))
print(f"\nCoefficient Comparison (first {n_compare} features):")
print("Feature\t\tBaseline\t\tL1\t\t\tL2")
for i in range(n_compare):
    print(f"{i}\t\t{logreg_baseline.coef_[0][i]:.6f}\t\t{logreg_l1.coef_[0][i]:.6f}\t\t{logreg_l2.coef_[0][i]:.6f}")

# Count coefficients that are exactly zero (vectorised rather than a
# Python-level sum over the array).
zero_l1 = int((logreg_l1.coef_[0] == 0).sum())
print(f"\nZero coefficients in L1: {zero_l1}/{len(logreg_l1.coef_[0])}")

# Evaluate and compare accuracy
acc_l1_train = accuracy_score(y_train, logreg_l1.predict(X_train))
acc_l1_test = accuracy_score(y_test, logreg_l1.predict(X_test))

acc_l2_train = accuracy_score(y_train, logreg_l2.predict(X_train))
acc_l2_test = accuracy_score(y_test, logreg_l2.predict(X_test))

print(f"\nAccuracy Comparison:")
print(f"{'Model':<15} {'Train Acc':<15} {'Test Acc':<15}")
print("-" * 45)
print(f"{'Baseline':<15} {train_acc:<15.4f} {test_acc:<15.4f}")
print(f"{'L1':<15} {acc_l1_train:<15.4f} {acc_l1_test:<15.4f}")
print(f"{'L2':<15} {acc_l2_train:<15.4f} {acc_l2_test:<15.4f}")

print("\nDiscussion:")
print("1. L1 produces sparse coefficients (feature selection)")
print("2. L2 shrinks all coefficients but rarely zero")
print("3. Regularization reduces variance and mitigates overfitting")
print("4. Overly strong regularization may increase bias")


Best L1 parameters: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best L2 parameters: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}

Coefficient Comparison (first 10 features):
Feature		Baseline		L1			L2
0		-1.249059		0.753783		4.488356
1		-0.019386		-0.108887		0.271960
2		0.199959		0.096381		-0.519464
3		0.004815		-0.002252		-0.007443
4		-17.215768		0.000000		-0.721446
5		13.319275		47.268661		-0.695211
6		-22.134905		-12.167407		-1.741763
7		-39.793068		-136.120490		-1.643938
8		8.984611		19.742938		-0.891149
9		2.937568		0.000000		0.036918

Zero coefficients in L1: 8/30

Accuracy Comparison:
Model           Train Acc       Test Acc       
---------------------------------------------
Baseline        0.9868          0.9825         
L1              0.9890          0.9825         
L2              0.9692          0.9561         

Discussion:
1. L1 produces sparse coefficients (feature selection)
2. L2 shrinks all coefficients but rarely zero
3. Regularization reduces variance a