<a href="https://colab.research.google.com/github/sajalbardewa/5CS037/blob/main/worksheet7AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.datasets import fetch_california_housing, load_breast_cancer

In [None]:
# Load the California housing data as (features, target) arrays.
# `download_if_missing` is left at its default (True) so the loader fetches the
# data when it is not already cached; forcing it to False raises
# "OSError: Data not found" on a machine without a local copy.
X, y = fetch_california_housing(return_X_y=True)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

OSError: Data not found and `download_if_missing` is False

In [None]:
# Baseline: plain least-squares regression, no regularisation, fitted on the
# training split only.
base_reg = LinearRegression().fit(X_train, y_train)

# Predict once per split, then score each split with mean squared error.
train_pred = base_reg.predict(X_train)
test_pred = base_reg.predict(X_test)
train_mse = mean_squared_error(y_train, train_pred)
test_mse = mean_squared_error(y_test, test_pred)

# Report both errors plus the fitted weights, which serve as the reference
# point for the regularised models.
print(f"Baseline Train MSE: {train_mse}")
print(f"Baseline Test MSE: {test_mse}")
print(f"Baseline Coefficients: {base_reg.coef_}")

In [None]:
# Candidate regularisation strengths, shared by the Ridge and Lasso searches.
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}

# Tune Ridge (L2): 5-fold cross-validation over the alpha grid, using the
# training split only so the test set stays unseen.
ridge_cv = GridSearchCV(Ridge(), param_grid, cv=5)
ridge_cv.fit(X_train, y_train)

# Tune Lasso (L1) with the same grid and the same fold count.
lasso_cv = GridSearchCV(Lasso(), param_grid, cv=5)
lasso_cv.fit(X_train, y_train)

# Report the winning alpha of each search.
print(f"Best Ridge Alpha: {ridge_cv.best_params_}")
print(f"Best Lasso Alpha: {lasso_cv.best_params_}")


In [None]:
# Take the winning models straight from the two grid searches. Nothing is
# trained here: the searches were built without a `refit` argument, and
# GridSearchCV's documented default (refit=True) already refits the best
# configuration on the data passed to fit().
best_ridge, best_lasso = ridge_cv.best_estimator_, lasso_cv.best_estimator_


In [None]:
# Take the best estimator found by each grid search (no extra training
# happens in this cell).
best_ridge = ridge_cv.best_estimator_
best_lasso = lasso_cv.best_estimator_

# Compare coefficients: the Lasso label asks the reader to look for weights
# that are exactly zero.
print(f"Ridge Coefficients: {best_ridge.coef_}")
print(f"Lasso Coefficients (Note zeros/sparsity): {best_lasso.coef_}")

# Evaluate both tuned models on the held-out test split.
print(f"Ridge Test MSE: {mean_squared_error(y_test, best_ridge.predict(X_test))}")
print(f"Lasso Test MSE: {mean_squared_error(y_test, best_lasso.predict(X_test))}")

In [None]:
# Breast cancer classification data, returned as (features, labels) arrays.
X_c, y_c = load_breast_cancer(return_X_y=True)

# 80/20 train/test split with a fixed seed so the partition is reproducible.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_c,
    y_c,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline logistic regression with the estimator's default penalty settings.
# max_iter is raised well above the default so the solver has room to converge.
base_clf = LogisticRegression(max_iter=10000)
base_clf.fit(X_train_c, y_train_c)

# Accuracy on the data the model was fitted on and on the held-out split.
train_acc = accuracy_score(y_train_c, base_clf.predict(X_train_c))
test_acc = accuracy_score(y_test_c, base_clf.predict(X_test_c))

print(f"Baseline Train Accuracy: {train_acc}")
print(f"Baseline Test Accuracy: {test_acc}")

In [None]:
# Search jointly over the inverse regularisation strength C and the penalty type.
# The 'liblinear' solver is used because it supports both 'l1' and 'l2'.
clf_param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}

# 5-fold cross-validation on the training split only.
clf_cv = GridSearchCV(LogisticRegression(solver='liblinear'), clf_param_grid, cv=5)
clf_cv.fit(X_train_c, y_train_c)

print(f"Best Classification Params: {clf_cv.best_params_}")

In [None]:
# Fit one L1 model and one L2 model at the same C so the two penalties can be
# compared directly. Note: best_C was selected jointly with a penalty in the
# grid search, so it is only guaranteed to be the best C for the winning penalty.
best_C = clf_cv.best_params_['C']
l1_model = LogisticRegression(penalty='l1', C=best_C, solver='liblinear').fit(X_train_c, y_train_c)
l2_model = LogisticRegression(penalty='l2', C=best_C, solver='liblinear').fit(X_train_c, y_train_c)

# Compare the fitted weights of the two penalties side by side.
print(f"L1 Sparse Coeffs: {l1_model.coef_}")
print(f"L2 Shrunken Coeffs: {l2_model.coef_}")

# Accuracy comparison on the held-out test split.
print(f"L1 Test Accuracy: {accuracy_score(y_test_c, l1_model.predict(X_test_c))}")
print(f"L2 Test Accuracy: {accuracy_score(y_test_c, l2_model.predict(X_test_c))}")