# Implementing L1 and L2 regularization using Sklearn

Prerequisites: L2 and L1 regularization

This article aims to implement the L2 and L1 regularization for Linear regression using the Ridge and Lasso modules of the Sklearn library of Python.

Dataset – House prices dataset .

In [29]:
!wget https://raw.githubusercontent.com/Shreyas3108/house-price-prediction/master/kc_house_data.csv

'wget' is not recognized as an internal or external command,
operable program or batch file.


Step 1: Importing the required libraries

In [30]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score 
from statistics import mean 
from sklearn import preprocessing

Step 2: Loading and cleaning the Data

In [31]:
# Changing the working location to the location of the data 

# Loading the data into a Pandas DataFrame 
data = pd.read_csv('kc_house_data.csv') 

In [32]:
data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180.0,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170.0,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770.0,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050.0,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680.0,0,1987,0,98074,47.6168,-122.045,1800,7503


In [33]:
data.shape

(21613, 21)

In [34]:
data.dropna(how='any',inplace=True)

In [35]:
data.isnull().sum()

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          1000 non-null   float64
 1   bedrooms       1000 non-null   int64  
 2   bathrooms      1000 non-null   float64
 3   sqft_living    1000 non-null   int64  
 4   sqft_lot       1000 non-null   int64  
 5   floors         1000 non-null   float64
 6   waterfront     1000 non-null   int64  
 7   view           1000 non-null   int64  
 8   condition      1000 non-null   int64  
 9   grade          1000 non-null   int64  
 10  sqft_above     998 non-null    float64
 11  sqft_basement  1000 non-null   int64  
 12  yr_built       1000 non-null   int64  
 13  yr_renovated   1000 non-null   int64  
 14  lat            1000 non-null   float64
 15  long           1000 non-null   float64
 16  sqft_living15  1000 non-null   int64  
 17  sqft_lot15     1000 non-null   int64  
dtypes: float6

In [8]:
# Dropping the numerically non-sensical variables 
dropColumns = ['id', 'date', 'zipcode'] 
data = data.drop(dropColumns, axis = 1) 
#to visualize overfitting selecting only 1000 datapoints
data = data[0:1000]

# Separating the dependent and independent variables 
y = data['price'] 
X = data.drop('price', axis = 1) 

x = X.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
X = pd.DataFrame(x_scaled)

# Dividing the data into training and testing set 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25) 

In [14]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((750, 17), (250, 17), (750,), (250,))

In [9]:
X.describe()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,998.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,3.349,2.04575,2051.196,14702.085,1.4465,0.008,0.237,3.464,7.606,1750.373747,300.863,1969.049,81.749,47.549493,-122.207472,1987.077,13496.874
std,0.852012,0.721623,887.929222,28961.030775,0.517354,0.089129,0.765125,0.689332,1.16022,790.847575,450.898196,28.190873,395.57825,0.14167,0.139509,670.439353,25093.829486
min,0.0,0.0,380.0,649.0,1.0,0.0,0.0,1.0,4.0,380.0,0.0,1900.0,0.0,47.1775,-122.49,830.0,660.0
25%,3.0,1.5,1407.5,5428.5,1.0,0.0,0.0,3.0,7.0,1190.0,0.0,1952.0,0.0,47.442875,-122.32225,1490.0,5404.5
50%,3.0,2.0,1900.0,8045.0,1.0,0.0,0.0,3.0,7.0,1535.0,0.0,1974.0,0.0,47.5635,-122.218,1850.0,7995.0
75%,4.0,2.5,2472.5,11489.25,2.0,0.0,0.0,4.0,8.0,2137.5,580.0,1992.0,0.0,47.6734,-122.118,2360.0,10403.0
max,7.0,5.0,6070.0,315374.0,3.5,1.0,4.0,5.0,12.0,6070.0,2060.0,2015.0,2014.0,47.7776,-121.709,4760.0,233971.0


Step 3: Building and evaluating the different models

a) Linear Regression:

In [10]:
# Bulding and fitting the Linear Regression model 
linearModel = LinearRegression() 
linearModel.fit(X_train, y_train) 

# Evaluating the Linear Regression model 
print(linearModel.score(X_train, y_train)) 
print(linearModel.score(X_test, y_test)) 

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

b) Ridge(L2) Regression:

In [None]:
# List to maintain the different cross-validation scores 
cross_val_scores_ridge = [] 

# List to maintain the different values of alpha 
alpha = [] 
# Loop to compute the different values of cross-validation scores 
for i in range(-3, 3): 
	ridgeModel = Ridge(alpha = 10**(-i)) 
	ridgeModel.fit(X_train, y_train) 
	scores = cross_val_score(ridgeModel, X, y, cv = 10) 
	avg_cross_val_score = mean(scores)*100
	cross_val_scores_ridge.append(avg_cross_val_score) 
	alpha.append(10**(-i)) 

# Loop to print the different values of cross-validation scores 
for i in range(0, len(alpha)): 
	print(str(alpha[i])+' : '+str(cross_val_scores_ridge[i])) 


In [None]:
# List to maintain the different cross-validation scores 
cross_val_scores_ridge = [] 

# List to maintain the different values of alpha 
alpha = [] 

# Loop to compute the different values of cross-validation scores 
for i in range(1, 9): 
	ridgeModel = Ridge(alpha = i * 0.25) 
	ridgeModel.fit(X_train, y_train) 
	scores = cross_val_score(ridgeModel, X, y, cv = 10) 
	avg_cross_val_score = mean(scores)*100
	cross_val_scores_ridge.append(avg_cross_val_score) 
	alpha.append(i * 0.25) 

# Loop to print the different values of cross-validation scores 
for i in range(0, len(alpha)): 
	print(str(alpha[i])+' : '+str(cross_val_scores_ridge[i])) 


In [None]:
# Building and fitting the Ridge Regression model 
ridgeModelChosen = Ridge(alpha = 1.5) 
ridgeModelChosen.fit(X_train, y_train) 

# Evaluating the Ridge Regression model 

print(ridgeModelChosen.score(X_train, y_train)) 
print(ridgeModelChosen.score(X_test, y_test)) 

c) Lasso(L1) Regression:

In [None]:
# List to maintain the cross-validation scores 
cross_val_scores_lasso = [] 

# List to maintain the different values of Lambda 
Lambda = [] 

# Loop to compute the cross-validation scores 
for i in range(-3, 3): 
	lassoModel = Lasso(alpha = 10**(-i), tol = 0.0925) 
	lassoModel.fit(X_train, y_train) 
	scores = cross_val_score(lassoModel, X, y, cv = 10) 
	avg_cross_val_score = mean(scores)*100
	cross_val_scores_lasso.append(avg_cross_val_score) 
	Lambda.append(10**(-i)) 

# Loop to print the different values of cross-validation scores 
for i in range(-3,3): 
	print(str(10**(-i))+' : '+str(cross_val_scores_lasso[i])) 

In [None]:
# List to maintain the cross-validation scores 
cross_val_scores_lasso = [] 

# List to maintain the different values of Lambda 
Lambda = [] 

# Loop to compute the cross-validation scores 
for i in range(1, 9): 
	lassoModel = Lasso(alpha = i * 0.25, tol = 0.0925) 
	lassoModel.fit(X_train, y_train) 
	scores = cross_val_score(lassoModel, X, y, cv = 10) 
	avg_cross_val_score = mean(scores)*100
	cross_val_scores_lasso.append(avg_cross_val_score) 
	Lambda.append(i * 0.25) 

# Loop to print the different values of cross-validation scores 
for i in range(0, len(alpha)): 
	print(str(i * 0.25)+' : '+str(cross_val_scores_lasso[i])) 

In [None]:
# Building and fitting the Lasso Regression Model 
lassoModelChosen = Lasso(alpha = 10, tol = 0.0925) 
lassoModelChosen.fit(X_train, y_train) 
# Evaluating the Lasso Regression model 
print(lassoModelChosen.score(X_train, y_train)) 
print(lassoModelChosen.score(X_test, y_test)) 


In [None]:
linearModel.coef_

In [None]:
ridgeModelChosen.coef_

In [None]:
lassoModelChosen.coef_

In [None]:
np.sum(abs(0-linearModel.coef_))

In [None]:
np.sum(abs(0-ridgeModelChosen.coef_)) 

In [None]:
np.sum(abs(0-lassoModelChosen.coef_)) 

Step 4: Comparing and Visualizing the results

In [None]:
# Building the two lists for visualization 
models = ['Linear Regression', 'Ridge Regression', 'Lasso Regression'] 
scores = [linearModel.score(X_test, y_test), 
		ridgeModelChosen.score(X_test, y_test), 
		lassoModelChosen.score(X_test, y_test)] 

# Building the dictionary to compare the scores 
mapping = {} 
mapping['Linear Regreesion'] = linearModel.score(X_test, y_test) 
mapping['Ridge Regreesion'] = ridgeModelChosen.score(X_test, y_test) 
mapping['Lasso Regression'] = lassoModelChosen.score(X_test, y_test) 

# Printing the scores for different models 
for key, val in mapping.items(): 
	print(str(key)+' : '+str(val)) 


In [None]:
# Plotting the scores 
plt.bar(models, scores) 
plt.xlabel('Regression Models') 
plt.ylabel('Score') 
plt.show() 
