# Linear Regression with Regularization using Sklearn
##  CPE 490 590
### Author: Rahul Bhadani

# Let's read the data first

In [2]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from statistics import mean
plt.rcParams['font.family'] = 'Serif'
plt.rcParams['font.size'] = 15

data = pd.read_csv("Dataset/HousePrices/kc_house_data.csv")
data.dropna(inplace=True) #remove drop na
data.head()

2024-02-21 12:51:40.942040: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-21 12:51:40.945456: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-21 12:51:40.978028: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-21 12:51:40.978050: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-21 12:51:40.978767: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180.0,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170.0,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770.0,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050.0,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680.0,0,1987,0,98074,47.6168,-122.045,1800,7503


In [3]:
# Drop the non-sensical variables that are not useful for regression
dropColumns = ['id', 'date', 'zipcode']
data = data.drop(dropColumns, axis = 1)
data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180.0,0,1955,0,47.5112,-122.257,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170.0,400,1951,1991,47.721,-122.319,1690,7639
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770.0,0,1933,0,47.7379,-122.233,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050.0,910,1965,0,47.5208,-122.393,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680.0,0,1987,0,47.6168,-122.045,1800,7503


# Separating the dependent and independent variables

In [4]:
y = data['price']
X = data.drop('price', axis = 1)
 
# Dividing the data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

# Building and evaluating the different models
## Linear Regression (without Regularization)

In [5]:
linearModel = LinearRegression()
linearModel.fit(X_train, y_train)
 
# Evaluating the Linear Regression model on the test dataset
# coefficient of determination of the prediction
print(linearModel.score(X_test, y_test))

0.6882352163607928


Check the coefficient of determination of the prediction you got from the Linear Regression without regularization

## Ridge(L2) Regression:

Note: in `Ridge` python class, $\alpha$ is the regularization parameter (we had this as $\lambda$ in the lecture slide).

In [6]:
# List to maintain the different cross-validation scores
cross_val_scores_ridge = []
 
# List to maintain the different values of alpha
alpha = []

accuracy_list = []
 
# Loop to compute the different values of cross-validation scores
for i in range(1, 9):
    ridgeModel = Ridge(alpha = i * 0.25)
    ridgeModel.fit(X_train, y_train)
    scores = cross_val_score(ridgeModel, X, y, cv = 10)
    avg_cross_val_score = mean(scores)*100
    cross_val_scores_ridge.append(avg_cross_val_score)
    alpha.append(i * 0.25)
    
    accuracy_list.append(ridgeModel.score(X_test, y_test))
# Loop to print the different values of cross-validation scores
for i in range(0, len(alpha)):
    print('Lambda:' + str(alpha[i])+'. Cross validation score: '+str(cross_val_scores_ridge[i]) + '   coefficient of determination of the prediction: {}'.format(accuracy_list[i]))

Lambda:0.25. Cross validation score: 69.09015837671014   coefficient of determination of the prediction: 0.6882760577465402
Lambda:0.5. Cross validation score: 69.09033468066406   coefficient of determination of the prediction: 0.6883163042375982
Lambda:0.75. Cross validation score: 69.09049125083845   coefficient of determination of the prediction: 0.6883559625887381
Lambda:1.0. Cross validation score: 69.0906282899928   coefficient of determination of the prediction: 0.6883950394639509
Lambda:1.25. Cross validation score: 69.09074599856251   coefficient of determination of the prediction: 0.6884335414379195
Lambda:1.5. Cross validation score: 69.09084457469034   coefficient of determination of the prediction: 0.6884714749974316
Lambda:1.75. Cross validation score: 69.09092421425656   coefficient of determination of the prediction: 0.6885088465427629
Lambda:2.0. Cross validation score: 69.09098511090886   coefficient of determination of the prediction: 0.6885456623890396


Which value of Lambda gives the best accuracy?

## Lasso (L1) Regression

In [7]:
# Building and fitting the Lasso Regression Model
lassoModelChosen = Lasso(alpha = 2, tol = 0.0925)
lassoModelChosen.fit(X_train, y_train)

# Evaluating the Lasso Regression model
print(lassoModelChosen.score(X_test, y_test))


0.6882427833154532


Which Model Performed Best?