Regression Task

Task-1 (Load and Split dataset)

In [7]:
import kagglehub
# Download latest version
path = kagglehub.dataset_download("camnugent/california-housing-prices")
print("Path to dataset files:",path)
import os
import pandas as pd
import numpy as np
housing_data_path = os.path.join(path,"housing.csv")
df =pd.read_csv(housing_data_path)
df.head()
df.isnull().sum()
df.dropna(inplace=True)
df.isnull().sum()
print("After dropping null:", df.isnull().sum)
#using one hot encoding for ocean_proximity as we need numerical input
df = pd.get_dummies(df, columns=['ocean_proximity'])
df.head()


Using Colab cache for faster access to the 'california-housing-prices' dataset.
Path to dataset files: /kaggle/input/california-housing-prices
After dropping null: <bound method DataFrame.sum of        longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0          False     False               False        False           False   
1          False     False               False        False           False   
2          False     False               False        False           False   
3          False     False               False        False           False   
4          False     False               False        False           False   
...          ...       ...                 ...          ...             ...   
20635      False     False               False        False           False   
20636      False     False               False        False           False   
20637      False     False               False        False           False   
20638      Fals

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,False,False,False,True,False
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,False,False,False,True,False
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,False,False,False,True,False
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,False,False,False,True,False
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,False,False,False,True,False


In [8]:
#Defining features and target values
X = df.drop(columns='median_house_value')
y = df['median_house_value']

In [9]:
#test-train split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

Training set shape: (16346, 13)
Test set shape: (4087, 13)


In [10]:
#feature scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled= scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Task-2 (Regression Task)

Step-1 (Baseline Model)

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

#training the mode
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

#making predictions
y_train_pred = lr.predict(X_train_scaled)
y_test_pred = lr.predict(X_test_scaled)

#calculating mse
train_mse_lr = mean_squared_error(y_train, y_train_pred)
test_mse_lr = mean_squared_error(y_test, y_test_pred)

print("Training MSE:", train_mse_lr)
print("Test MSE:", test_mse_lr)

Training MSE: 4690511174.839978
Test MSE: 4802173538.604161


Step-2 (Hyperparameter tuning)

In [12]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
alpha = {'alpha' : [0.01, 0.1, 1, 10, 100]}

#ridge
ridge = Ridge()
ridge_cv = GridSearchCV(ridge, alpha, cv=5, scoring='neg_mean_squared_error')
ridge_cv.fit(X_train_scaled, y_train)
best_ridge = ridge_cv.best_estimator_
print("Best Ridge alpha:", ridge_cv.best_params_)

#evaluate on test set
ridge_test_pred = best_ridge.predict(X_test_scaled)
ridge_test_mse = mean_squared_error(y_test, ridge_test_pred)
print("Ridge Test MSE:", ridge_test_mse)

#lasso
from sklearn.linear_model import Lasso
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
lasso = Lasso(max_iter = 50000)
lasso_cv = GridSearchCV(lasso, alpha, cv=5, scoring='neg_mean_squared_error')
lasso_cv.fit(X_train_scaled, y_train)
best_lasso = lasso_cv.best_estimator_
print("Best Lasso alpha:", lasso_cv.best_params_)

#evaluate on test set
lasso_test_pred = best_lasso.predict(X_test_scaled)
lasso_test_mse = mean_squared_error(y_test, lasso_test_pred)
print("Lasso Test MSE:", lasso_test_mse)
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
alpha = {'alpha' : [0.01, 0.1, 1, 10, 100]}

#ridge
ridge = Ridge()
ridge_cv = GridSearchCV(ridge, alpha, cv=5, scoring='neg_mean_squared_error')
ridge_cv.fit(X_train_scaled, y_train)
best_ridge = ridge_cv.best_estimator_
print("Best Ridge alpha:", ridge_cv.best_params_)

#evaluate on test set
ridge_test_pred = best_ridge.predict(X_test_scaled)
ridge_test_mse = mean_squared_error(y_test, ridge_test_pred)
print("Ridge Test MSE:", ridge_test_mse)

#lasso
from sklearn.linear_model import Lasso
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
lasso = Lasso(max_iter = 50000)
lasso_cv = GridSearchCV(lasso, alpha, cv=5, scoring='neg_mean_squared_error')
lasso_cv.fit(X_train_scaled, y_train)
best_lasso = lasso_cv.best_estimator_
print("Best Lasso alpha:", lasso_cv.best_params_)

#evaluate on test set
lasso_test_pred = best_lasso.predict(X_test_scaled)
lasso_test_mse = mean_squared_error(y_test, lasso_test_pred)
print("Lasso Test MSE:", lasso_test_mse)


Best Ridge alpha: {'alpha': 10}
Ridge Test MSE: 4802346500.283922
Best Lasso alpha: {'alpha': 100}
Lasso Test MSE: 4803641287.834215
Best Ridge alpha: {'alpha': 10}
Ridge Test MSE: 4802346500.283922
Best Lasso alpha: {'alpha': 100}
Lasso Test MSE: 4803641287.834215


Step : 3 Regularization Experiments

In [13]:
#coefficient comparision
coef_comparision = pd.DataFrame({'Feature': X.columns, 'Ridge Coeff': best_ridge.coef_, 'Lasso Coeff': best_lasso.coef_})
print(coef_comparision)
#evaluation of both models on training and testing mse
train_mse_ridge = mean_squared_error(y_train, best_ridge.predict(X_train_scaled))
test_mse_ridge = mean_squared_error(y_test, best_ridge.predict(X_test_scaled))
train_mse_lasso = mean_squared_error(y_train, best_lasso.predict(X_train_scaled))
test_mse_lasso = mean_squared_error(y_test, best_lasso.predict(X_test_scaled))
print("MSE on training ridge", train_mse_ridge)
print("MSE on training lasso", train_mse_lasso)
print("MSE on testing ridge", test_mse_ridge)
print("MSE on testing lasso", test_mse_lasso)


                       Feature   Ridge Coeff   Lasso Coeff
0                    longitude -53232.665362 -51480.325063
1                     latitude -53618.097828 -51899.939807
2           housing_median_age  13609.134817  13541.236610
3                  total_rooms -13253.091405 -11639.441066
4               total_bedrooms  42120.416466  40889.110337
5                   population -41004.936475 -40393.847989
6                   households  16738.645115  15816.950357
7                median_income  74437.435794  74077.805021
8    ocean_proximity_<1H OCEAN   6488.290585      0.000000
9       ocean_proximity_INLAND -12502.739914 -19088.976596
10      ocean_proximity_ISLAND   3077.059302   2812.053936
11    ocean_proximity_NEAR BAY   2205.465141  -1653.914518
12  ocean_proximity_NEAR OCEAN   5507.919817   1167.824032
MSE on training ridge 4690613124.205682
MSE on training lasso 4691614587.954662
MSE on testing ridge 4802346500.283922
MSE on testing lasso 4803641287.834215


Effect of regularization on the Bias-Variance Tradeoff
Regularization reduces variance by shrinking regression coefficients, thereby preventing overfitting and improving generalization, as observed through lower test MSE. Excessive regularization increases bias and leads to underfitting.

Part-2 (Classification Task)

Task-1 (Load and split the dataset)

In [14]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
data = load_breast_cancer()
type(data)
data.keys()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
df.info()
df['target'].value_counts()
# X_c= df.drop(columns = ['target'])
# y_c = df['target']


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,357
0,212


In [15]:
from sklearn.model_selection import train_test_split
X_c, y_c = load_breast_cancer(return_X_y=True)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_c, y_c, test_size=0.2, random_state=42)
print (X_train_c.shape)
print (X_test_c.shape)

(455, 30)
(114, 30)


In [16]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_c_scaled = scaler.fit_transform(X_train_c)
X_test_c_scaled = scaler.transform(X_test_c)

Task-2 (Classification task)

Step-1 (Baseline model)

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#training the model
log = LogisticRegression(max_iter = 1000)
log.fit(X_train_c_scaled, y_train_c)

feature_names_c = data.feature_names

#making predictions
y_train_c_pred = log.predict(X_train_c_scaled)
y_test_c_pred = log.predict(X_test_c_scaled)

#model coefficients
coef_c = pd.DataFrame({
    'feature': feature_names_c,
    'coefficient': log.coef_[0]}).sort_values(by='coefficient',ascending = False)

coef_c.head()

#computing accuracy
train_accuracy_c = accuracy_score(y_train_c, y_train_c_pred)
print("Training Accuracy:", train_accuracy_c)

test_accuracy_c = accuracy_score(y_test_c, y_test_c_pred)
print("Test Accuracy:", test_accuracy_c)


Training Accuracy: 0.9868131868131869
Test Accuracy: 0.9736842105263158


Step-2 (Hyperparameter Tuning)

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

param_grid_c = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

# ridge
log_ridge_c = LogisticRegression(penalty='l2',solver='liblinear',max_iter=1000)

# GridSearchCV
log_ridge_cv = GridSearchCV(estimator=log_ridge_c,param_grid=param_grid_c,cv=5, scoring='accuracy')
log_ridge_cv.fit(X_train_c_scaled, y_train_c)
best_log_ridge_c = log_ridge_cv.best_estimator_
print("Best ridge parameter:", log_ridge_cv.best_params_)

#lasso
log_lasso_c = LogisticRegression(penalty='l1',solver='liblinear',max_iter=1000)

# GridSearchCV
log_lasso_cv = GridSearchCV(estimator=log_lasso_c,param_grid=param_grid_c,cv=5,scoring='accuracy')
log_lasso_cv.fit(X_train_c_scaled, y_train_c)
best_log_lasso_c = log_lasso_cv.best_estimator_
print("Best Lasso parameter:", log_lasso_cv.best_params_)

Best ridge parameter: {'C': 0.1}
Best Lasso parameter: {'C': 1}


Step-3 (Regularization Experiments)

In [19]:
#evaluating on accuracy ridge:
y_test_c_pred_ridge = best_log_ridge_c.predict(X_test_c_scaled)
test_accuracy_ridge_c = accuracy_score(y_test_c, y_test_c_pred_ridge)
print("Ridge Test Accuracy:", test_accuracy_ridge_c)
y_train_c_pred_ridge = best_log_ridge_c.predict(X_train_c_scaled)
train_accuracy_ridge_c = accuracy_score(y_train_c, y_train_c_pred_ridge)
print("Ridge Train Accuracy:", train_accuracy_ridge_c)

#evaluating on accuracy lasso:
y_test_c_pred_lasso = best_log_lasso_c.predict(X_test_c_scaled)
test_accuracy_lasso_c = accuracy_score(y_test_c, y_test_c_pred_lasso)
print("Lasso Test Accuracy:", test_accuracy_lasso_c)
y_train_c_pred_lasso = best_log_lasso_c.predict(X_train_c_scaled)
train_accuracy_lasso_c = accuracy_score(y_train_c, y_train_c_pred_lasso)
print("Lasso Train Accuracy:", train_accuracy_lasso_c)


Ridge Test Accuracy: 0.9912280701754386
Ridge Train Accuracy: 0.9824175824175824
Lasso Test Accuracy: 0.9736842105263158
Lasso Train Accuracy: 0.989010989010989


Effect of Regularization on the Bias-Variance Tradeoff
Regularization reduces variance by constraining model coefficients, preventing overfitting and improving classification accuracy on unseen data. However, overly strong regularization increases bias and may reduce accuracy.