### Libraries Imported

In [23]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn import metrics
from sklearn.metrics import r2_score
from numpy import sqrt

from sklearn.linear_model import Lasso

### Reading data from CSV
<p>Removing unnecessary columns</p>
<p>Renaming column-names to something meaningful</p>

In [2]:
column_names = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "PRICE"]
data = pd.read_csv("../input/boston-housing.csv", header=None, delimiter=r"\s+", names=column_names)

print(data.head())

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   

   PTRATIO       B  LSTAT  PRICE  
0     15.3  396.90   4.98   24.0  
1     17.8  396.90   9.14   21.6  
2     17.8  392.83   4.03   34.7  
3     18.7  394.63   2.94   33.4  
4     18.7  396.90   5.33   36.2  


In [3]:
X = data.drop('PRICE',axis=1)
Y = data['PRICE']
print(X.head())
print(Y.head())

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   

   PTRATIO       B  LSTAT  
0     15.3  396.90   4.98  
1     17.8  396.90   9.14  
2     17.8  392.83   4.03  
3     18.7  394.63   2.94  
4     18.7  396.90   5.33  
0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: PRICE, dtype: float64


### Test/Train Split
<p>Dividing data into test-train sets, 30% and 70%</p>

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
print(X_test.shape, Y_test.shape)

(152, 13) (152,)


### Linear Regression
<p>Fit the model according to "data" variable obtained from CSV.</p>

In [5]:
lr = LinearRegression() 
lr.fit(X_train, Y_train)

Y_pred = lr.predict(X_test)

### Linear Regression Model metrics

In [6]:
print('Mean absolute error : ', metrics.mean_absolute_error(Y_test,Y_pred))
print('Mean square error : ', metrics.mean_squared_error(Y_test,Y_pred))
print('R squared error', r2_score(Y_test,Y_pred))
print('RMSE', sqrt(metrics.mean_squared_error(Y_test,Y_pred)))

Mean absolute error :  3.788363859662725
Mean square error :  36.16647935808379
R squared error 0.6115168603752031
RMSE 6.013857277827916


### Lasso Regression

In [18]:
lr = Lasso(alpha=0.01)
lr.fit(X_train, Y_train)

Y_predRR = lr.predict(X_test)

In [19]:
print('Mean absolute error : ', metrics.mean_absolute_error(Y_test,Y_predRR))
print('Mean square error : ', metrics.mean_squared_error(Y_test,Y_predRR))
print('R squared error', r2_score(Y_test,Y_predRR))
print('RMSE', sqrt(metrics.mean_squared_error(Y_test,Y_predRR)))

Mean absolute error :  3.789767179243429
Mean square error :  36.673700861428955
R squared error 0.6060685279524152
RMSE 6.0558815098570875


### Before and after metrics
Comparing the performance before and after applying L2 regularization.

In [20]:
train_score=lr.score(X_train, Y_train)
test_score=lr.score(X_test, Y_test)

Lasso_train_score = lr.score(X_train, Y_train)
Lasso_test_score = lr.score(X_test, Y_test)

In [21]:
print("Linear regression train score:", train_score)
print("Linear regression test score:", test_score)
print("Lasso regression train score:", Lasso_train_score)
print("Lasso regression test score:", Lasso_test_score)

Linear regression train score: 0.7822372329388865
Linear regression test score: 0.6060685279524152
Lasso regression train score: 0.7822372329388865
Lasso regression test score: 0.6060685279524152
