# Boston Housing Linear Regression

In [34]:
import sys
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
sys.path.append("..")


## Select the Imput File

In [35]:
inputFile = "../data/Boston_Housing_Data.csv"

## Read in data and create a DataFrame 

In [36]:
df = pd.read_csv(inputFile,delimiter=";")

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 15 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
 14  CAT      506 non-null    int64  
dtypes: float64(11), int64(4)
memory usage: 59.4 KB
None


## Feature selection

In [37]:
df_features = df.drop(["MEDV","CAT"],axis=1) # drop label attribute from the features
df_labels = df[["MEDV"]].copy()
display(df_features)
display(df_labels)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48


Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2
...,...
501,22.4
502,20.6
503,23.9
504,22.0


## Training and test data split

In [38]:
X_train, X_test, y_train, y_test = train_test_split(df_features,df_labels,test_size=0.3,random_state=1234)
display (X_train)
display (X_test) 
display (y_train)
display (y_test)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
99,0.06860,0.0,2.89,0,0.4450,7.416,62.5,3.4952,2,276,18.0,396.90,6.19
102,0.22876,0.0,8.56,0,0.5200,6.405,85.4,2.7147,5,384,20.9,70.80,10.63
416,10.83420,0.0,18.10,0,0.6790,6.782,90.8,1.8195,24,666,20.2,21.57,25.79
266,0.78570,20.0,3.97,0,0.6470,7.014,84.6,2.1329,5,264,13.0,384.07,14.79
101,0.11432,0.0,8.56,0,0.5200,6.781,71.3,2.8561,5,384,20.9,395.58,7.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,0.02009,95.0,2.68,0,0.4161,8.034,31.9,5.1180,4,224,14.7,390.55,2.88
53,0.04981,21.0,5.64,0,0.4390,5.998,21.4,6.8147,4,243,16.8,396.90,8.43
294,0.08199,0.0,13.92,0,0.4370,6.009,42.3,5.5027,4,289,16.0,396.90,10.40
211,0.37578,0.0,10.59,1,0.4890,5.404,88.6,3.6650,4,277,18.6,395.24,23.98


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
64,0.01951,17.5,1.38,0,0.4161,7.104,59.5,9.2229,3,216,18.6,393.24,8.05
100,0.14866,0.0,8.56,0,0.5200,6.727,79.9,2.7778,5,384,20.9,394.76,9.42
400,25.04610,0.0,18.10,0,0.6930,5.987,100.0,1.5888,24,666,20.2,396.90,26.77
485,3.67367,0.0,18.10,0,0.5830,6.312,51.9,3.9917,24,666,20.2,388.62,10.58
454,9.51363,0.0,18.10,0,0.7130,6.728,94.1,2.4961,24,666,20.2,6.68,18.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...
314,0.36920,0.0,9.90,0,0.5440,6.567,87.3,3.6023,4,304,18.4,395.69,9.28
287,0.03871,52.5,5.32,0,0.4050,6.209,31.3,7.3172,6,293,16.6,396.90,7.14
384,20.08490,0.0,18.10,0,0.7000,4.368,91.2,1.4395,24,666,20.2,285.83,30.63
108,0.12802,0.0,8.56,0,0.5200,6.474,97.1,2.4329,5,384,20.9,395.24,12.27


Unnamed: 0,MEDV
99,33.2
102,18.6
416,7.5
266,30.7
101,26.5
...,...
204,50.0
53,23.4
294,21.7
211,19.3


Unnamed: 0,MEDV
64,33.0
100,27.5
400,5.6
485,21.2
454,14.9
...,...
314,23.8
287,23.2
384,8.8
108,19.8


## Build and train the model

In [39]:
#lr = LinearRegression(fit_intercept=False) 
#lr = Ridge(alpha=0.3,max_iter=100)
#lr= Lasso(alpha=0.01,max_iter=100)
lr = ElasticNet(alpha=0.01,max_iter=1000,l1_ratio=0.5, fit_intercept=False)
lr_model = lr.fit(X_train, y_train)

print("Coefficients: %s" % str(lr_model.coef_))
print("Intercept: %s" % str(lr_model.intercept_))

Coefficients: [-8.48272664e-02  6.62391651e-02 -9.00667307e-02  2.39404783e+00
 -7.81375208e+00  2.55349871e+00 -3.46061899e-03 -1.61002954e+00
  3.34472051e-01 -1.47073368e-02 -9.22924640e-01  9.75381423e-03
 -5.97011107e-01]
Intercept: [41.38642634]


## Test the Model

In [40]:
predictions = lr_model.predict(X_test)
print(predictions)

[21.94325253 23.93042493 12.07888275 21.46381141 14.69774121 27.11403433
 20.58850355 18.58329402 26.57353232 33.35599457 16.32366482 20.73052434
 22.69273702 15.2337338  19.05506376 22.63339431 12.18222983 17.20806252
 20.62125177  7.46576078 30.73824844 17.28748128 21.51124722 18.75452074
 28.17023762 21.52661712 23.25789726 24.67505316 21.52467405 19.45941037
 21.4520129  14.44534736 18.49355607 17.54572542 23.90467907 18.18738955
 31.39557689 21.31647459  6.74490984  7.76013804  8.51487551 20.27729063
 37.41107218 33.2753462  18.87570855 28.38161511 20.16428008 39.79545169
 15.44482059 27.1269637  20.61948336 33.33960732 32.12557286 22.89096512
 32.30774959 33.98530582 10.07506347 29.5260485  25.78768688 22.35579604
 35.01106657 17.84087835 25.77376794 32.32646085 14.47985922 20.78976186
 36.96670721 20.2890015  34.9163549  36.42513278 16.35772082 24.76902253
 11.63541213 24.09192152 23.89140432 20.19685837 37.24068975 17.78797534
 14.72872223 12.88779648 35.9224028  40.73373592 24

In [41]:
rmse = mean_squared_error(y_test,predictions)

print("root mean square error = " , rmse)


root mean square error =  23.641326510911057
