<a href="https://colab.research.google.com/github/prasunamishra/5CS037/blob/main/Week7_Prasuna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Part 1: Regression Task (California Housing)

In [8]:
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
import numpy as np




In [9]:
# Load dataset
data = pd.read_csv("/content/sample_data/california_housiing.csv")

#categorical variables to numerical
data = pd.get_dummies(data, drop_first=True)
data = data.fillna(data.mean(numeric_only=True))
X = data.drop("median_house_value", axis=1)
y = data["median_house_value"]

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [10]:
# Baseline Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)




In [11]:
# Predictions on training and test data
train_pred = lr.predict(X_train)
test_pred = lr.predict(X_test)

# Evaluation
print("Train MSE:", mean_squared_error(y_train, train_pred))
print("Test MSE:", mean_squared_error(y_test, test_pred))

#Display learned coefficients
print("Coefficients:", lr.coef_)

Train MSE: 4683203783.504253
Test MSE: 4904399775.949265
Coefficients: [-2.68382734e+04 -2.54683520e+04  1.10218508e+03 -6.02150567e+00
  1.02789395e+02 -3.81729064e+01  4.82527528e+01  3.94739752e+04
 -3.97866562e+04  1.36125073e+05 -5.13664222e+03  3.43114007e+03]


In [12]:
# Hyperparameter tuning for Ridge & Lasso
alpha_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}

# Ridge Regression with Cross-Validation
ridge = Ridge()
ridge_cv = GridSearchCV(ridge, alpha_grid, cv=5)
ridge_cv.fit(X_train, y_train)

# Lasso Regression with Cross-Validation
lasso = Lasso(max_iter=10000)
lasso_cv = GridSearchCV(lasso, alpha_grid, cv=5)
lasso_cv.fit(X_train, y_train)

print("Best Ridge Alpha:", ridge_cv.best_params_)
print("Best Lasso Alpha:", lasso_cv.best_params_)


Best Ridge Alpha: {'alpha': 1}
Best Lasso Alpha: {'alpha': 0.01}


In [13]:
# Extract best models
best_ridge = ridge_cv.best_estimator_
best_lasso = lasso_cv.best_estimator_

# Predictions
ridge_train_pred = best_ridge.predict(X_train)
ridge_test_pred = best_ridge.predict(X_test)

lasso_train_pred = best_lasso.predict(X_train)
lasso_test_pred = best_lasso.predict(X_test)

# Evaluate Ridge
print("\nRidge Regression")
print("Train MSE:", mean_squared_error(y_train, ridge_train_pred))
print("Test MSE:", mean_squared_error(y_test, ridge_test_pred))
print("Coefficients:\n", best_ridge.coef_)

# Evaluate Lasso
print("\nLasso Regression")
print("Train MSE:", mean_squared_error(y_train, lasso_train_pred))
print("Test MSE:", mean_squared_error(y_test, lasso_test_pred))
print("Coefficients:\n", best_lasso.coef_)



Ridge Regression
Train MSE: 4683383574.687479
Test MSE: 4905952780.84936
Coefficients:
 [-2.68601981e+04 -2.54931312e+04  1.10262903e+03 -6.01934563e+00
  1.02927379e+02 -3.81776325e+01  4.80990028e+01  3.94709515e+04
 -3.97596504e+04  1.08852318e+05 -5.14260657e+03  3.41116707e+03]

Lasso Regression
Train MSE: 4683203783.920323
Test MSE: 4904401711.447271
Coefficients:
 [-2.68382735e+04 -2.54683700e+04  1.10218536e+03 -6.02150499e+00
  1.02789549e+02 -3.81729119e+01  4.82525853e+01  3.94739710e+04
 -3.97865941e+04  1.36083736e+05 -5.13649993e+03  3.43105996e+03]


# Part 2: Classification Task (Breast Cancer)

In [14]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load dataset
X, y = load_breast_cancer(return_X_y=True)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [15]:
# Baseline Logistic Regression
log_reg = LogisticRegression(max_iter=10000)
log_reg.fit(X_train, y_train)



In [16]:
train_pred = log_reg.predict(X_train)
test_pred = log_reg.predict(X_test)

print("Baseline Logistic Regression")
print("Train Accuracy:", accuracy_score(y_train, train_pred))
print("Test Accuracy:", accuracy_score(y_test, test_pred))
print("Coefficients:\n", log_reg.coef_)


Baseline Logistic Regression
Train Accuracy: 0.9582417582417583
Test Accuracy: 0.956140350877193
Coefficients:
 [[ 1.0274368   0.22145051 -0.36213488  0.0254667  -0.15623532 -0.23771256
  -0.53255786 -0.28369224 -0.22668189 -0.03649446 -0.09710208  1.3705667
  -0.18140942 -0.08719575 -0.02245523  0.04736092 -0.04294784 -0.03240188
  -0.03473732  0.01160522  0.11165329 -0.50887722 -0.01555395 -0.016857
  -0.30773117 -0.77270908 -1.42859535 -0.51092923 -0.74689363 -0.10094404]]


In [17]:
# Hyperparameter tuning with CV
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

log_reg_cv = GridSearchCV(
    LogisticRegression(max_iter=10000),
    param_grid,
    cv=5
)

log_reg_cv.fit(X_train, y_train)

print("Best Parameters:", log_reg_cv.best_params_)


Best Parameters: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}


In [18]:
# Evaluate best model
best_log_reg = log_reg_cv.best_estimator_

train_pred = best_log_reg.predict(X_train)
test_pred = best_log_reg.predict(X_test)

print("\nBest Logistic Regression Model")
print("Train Accuracy:", accuracy_score(y_train, train_pred))
print("Test Accuracy:", accuracy_score(y_test, test_pred))
print("Coefficients:\n", best_log_reg.coef_)



Best Logistic Regression Model
Train Accuracy: 0.9692307692307692
Test Accuracy: 0.956140350877193
Coefficients:
 [[ 4.48835633  0.27196032 -0.51946446 -0.0074426  -0.72144586 -0.69521094
  -1.74176288 -1.6439379  -0.89114892  0.03691798 -0.31545108  3.35632861
  -0.88727336 -0.07335493 -0.11096036  0.80035637  0.92249598 -0.11617823
  -0.02500433  0.14611177  0.51777817 -0.6428525   0.16484437 -0.02768451
  -1.40296201 -1.61471735 -2.97856699 -2.6019605  -2.99483671 -0.02661842]]
