# Build the linear regression model using scikit learn in boston data to predict 'Price' based on other dependent variable.

In [37]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [2]:
boston = load_boston()

In [3]:
bos = pd.DataFrame(boston.data,columns=['A', 'B', 'C', 'D','E','F','G','H','I','J','K','L','Price'])
bos.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,Price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [4]:
bos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       506 non-null    float64
 1   B       506 non-null    float64
 2   C       506 non-null    float64
 3   D       506 non-null    float64
 4   E       506 non-null    float64
 5   F       506 non-null    float64
 6   G       506 non-null    float64
 7   H       506 non-null    float64
 8   I       506 non-null    float64
 9   J       506 non-null    float64
 10  K       506 non-null    float64
 11  L       506 non-null    float64
 12  Price   506 non-null    float64
dtypes: float64(13)
memory usage: 51.5 KB


In [5]:
bos.shape

(506, 13)

# Preparing X and Y

In [6]:
x= bos.drop(["Price"], axis = 1)

In [7]:
x.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9


In [8]:
y = bos["Price"]

In [9]:
y.head()

0    4.98
1    9.14
2    4.03
3    2.94
4    5.33
Name: Price, dtype: float64

In [10]:
x.shape

(506, 12)

In [11]:
y.shape

(506,)

# Splitting Data into Training and Testing Sets

In [12]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3 , random_state=100)

In [13]:
x_train.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L
463,5.82115,0.0,18.1,0.0,0.713,6.513,89.9,2.8016,24.0,666.0,20.2,393.82
75,0.09512,0.0,12.83,0.0,0.437,6.286,45.0,4.5026,5.0,398.0,18.7,383.23
478,10.233,0.0,18.1,0.0,0.614,6.185,96.7,2.1705,24.0,666.0,20.2,379.7
199,0.0315,95.0,1.47,0.0,0.403,6.975,15.3,7.6534,3.0,402.0,17.0,396.9
84,0.05059,0.0,4.49,0.0,0.449,6.389,48.0,4.7794,3.0,247.0,18.5,396.9


In [14]:
y_train.head()

463    10.29
75      8.94
478    18.03
199     4.56
84      9.62
Name: Price, dtype: float64

In [15]:
x_test.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L
198,0.03768,80.0,1.52,0.0,0.404,7.274,38.3,7.309,2.0,329.0,12.6,392.2
229,0.44178,0.0,6.2,0.0,0.504,6.552,21.4,3.3751,8.0,307.0,17.4,380.34
502,0.04527,0.0,11.93,0.0,0.573,6.12,76.7,2.2875,1.0,273.0,21.0,396.9
31,1.35472,0.0,8.14,0.0,0.538,6.072,100.0,4.175,4.0,307.0,21.0,376.73
315,0.25356,0.0,9.9,0.0,0.544,5.705,77.7,3.945,4.0,304.0,18.4,396.42


In [16]:
y_test.head()

198     6.62
229     3.76
502     9.08
31     13.04
315    11.50
Name: Price, dtype: float64

In [18]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(354, 12)
(354,)
(152, 12)
(152,)


# Performing Linear Regression

In [19]:
# import LinearRegression from sklearn
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Representing LinearRegression as lr(Creating LinearRegression Object)
lr = LinearRegression()

# Fit the model using lr.fit()
model_fit = lr.fit(x_train, y_train)

In [21]:
model_fit.predict(x_train)

array([ 1.50398464e+01,  9.35657776e+00,  1.73795062e+01,  3.14163292e+00,
        8.49754890e+00,  1.62359342e+01,  1.32622884e+01,  2.01282640e+01,
        1.28936466e+01,  8.42075313e+00,  1.20417250e+01,  1.99836164e+01,
        7.56838713e+00,  9.56616061e+00,  2.22646006e+01,  6.91501772e+00,
        1.00453948e+01,  1.78997268e+01,  1.22705367e+01,  6.84312860e+00,
        1.08590209e+01,  3.44379588e+00,  6.00964523e+00,  4.97447514e+00,
        1.21540846e+01,  1.70290725e+01,  8.67600661e+00,  1.08549920e+01,
        3.63685636e+00,  1.67441725e+01,  9.08728920e+00,  1.35108895e+01,
        7.49900360e+00,  1.56621248e+01,  9.86122588e+00,  1.65921699e+01,
        4.02884096e+00,  1.72019420e+01,  5.94272082e+00,  2.00254225e+01,
        2.77575237e+01,  1.60277621e+01,  7.13517150e+00,  1.94506246e+01,
        1.81620299e+01,  2.85804304e+01,  1.21335223e+01,  1.54224654e+01,
        1.81884479e+01,  1.78375236e+01,  9.27827701e+00,  9.91546848e+00,
        1.32326461e+01,  

In [35]:
y_pred = lr.predict(x_test)

In [39]:
print('MSE test: %.3f' % (
        #mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_pred)))
print('R^2 test: %.3f' % (
      #  r2_score(y_train, y_train_pred),
        r2_score(y_test, y_pred)))

MSE test: 17.687
R^2 test: 0.590
