In [108]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston

In [109]:
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on older releases — pin the version or switch datasets.
boston_dataset = load_boston()

In [110]:
# Display the dataset's own description text (attribute list, instance count).
print(boston_dataset['DESCR'])

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [111]:
# The dataset object exposes its predictor column names under 'feature_names'.
print(boston_dataset['feature_names'])

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


-----

In [112]:
# Wrap the raw feature matrix in a DataFrame, labelling columns with the feature names.
df_boston = pd.DataFrame(
    boston_dataset.data,
    columns=boston_dataset.feature_names,
)

In [113]:
# Preview the first rows of the feature-only frame.
df_boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [114]:
# Attach the regression target as a PRICE column so features and target sit side by side.
# Note: this mutates df_boston in place.
df_boston['PRICE'] = boston_dataset.target

In [115]:
# Preview again to confirm the PRICE column was appended.
df_boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [116]:
# Feature matrix and target vector taken directly from the dataset object
# (the raw arrays, not the df_boston frame).
X_data, y_data = boston_dataset.data, boston_dataset.target

In [117]:
from sklearn.linear_model import LinearRegression

In [118]:
# Ordinary least-squares estimator with default settings.
linReg = LinearRegression()

In [119]:
# Fit on the complete dataset (no hold-out yet); used below to inspect intercept/coefficients.
linReg.fit(X_data,y_data)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [120]:
# Intercept of the model as currently fitted (at this point: fitted on the full dataset).
print('Intercept is %.2f'%linReg.intercept_)

Intercept is 36.49


In [121]:
# len(coef_) is the NUMBER of fitted coefficients (one per feature), not a
# coefficient value, so label it as a count and format it as an integer.
print('Number of coefficients is %d' % len(linReg.coef_))

The coefficient is 13.00


In [122]:
# Split the dataset into train and test data.
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

In [123]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.3,
    random_state=0,
)

In [124]:
# Sanity check: (rows, features) of the training features.
X_train.shape

(354, 13)

In [125]:
# Sanity check: training target length should match the X_train row count.
y_train.shape

(354,)

In [126]:
# Sanity check: (rows, features) of the held-out features.
X_test.shape

(152, 13)

In [127]:
# Sanity check: held-out target length should match the X_test row count.
y_test.shape

(152,)

In [128]:
# Refit the same estimator on the training split only. This replaces the
# parameters learned earlier from the full dataset, so intercept_/coef_ now
# reflect the training rows.
linReg.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [129]:
# Predictions for the held-out rows, from the model fitted on the training split.
Predict_test = linReg.predict(X_test)

In [130]:
# Display the predicted values (full array).
Predict_test

array([24.93551831, 23.75668597, 29.3364008 , 11.99898444, 21.37583999,
       19.19718511, 20.57022126, 21.2138302 , 19.05187659, 20.31028442,
        5.47685057, 16.88415507, 17.13177611,  5.41132187, 40.2160287 ,
       32.30923608, 22.46445111, 36.50566714, 31.03913253, 23.17552674,
       24.75105205, 24.51122436, 20.65675756, 30.45679279, 22.33344401,
       10.18647997, 17.44394817, 18.24663845, 35.62978156, 20.81890427,
       18.26969192, 17.71558071, 19.34036094, 23.62642525, 28.98283423,
       19.43835179, 11.14022634, 24.82272051, 18.00566388, 15.57161273,
       26.2207858 , 20.81349155, 22.17395321, 15.48276713, 22.6264291 ,
       24.89397648, 19.75674027, 23.03741163,  9.84032063, 24.36186515,
       21.43835984, 17.61122699, 24.39248313, 29.93655984, 13.55766168,
       21.53449652, 20.53870439, 15.03422398, 14.34404261, 22.12289936,
       17.07752265, 21.54237223, 32.96641318, 31.37158339, 17.79556174,
       32.75501672, 18.72716141, 19.20885247, 19.3875019 , 23.08

In [131]:
# Display the actual target values for comparison with the predictions.
y_test

array([22.6, 50. , 23. ,  8.3, 21.2, 19.9, 20.6, 18.7, 16.1, 18.6,  8.8,
       17.2, 14.9, 10.5, 50. , 29. , 23. , 33.3, 29.4, 21. , 23.8, 19.1,
       20.4, 29.1, 19.3, 23.1, 19.6, 19.4, 38.7, 18.7, 14.6, 20. , 20.5,
       20.1, 23.6, 16.8,  5.6, 50. , 14.5, 13.3, 23.9, 20. , 19.8, 13.8,
       16.5, 21.6, 20.3, 17. , 11.8, 27.5, 15.6, 23.1, 24.3, 42.8, 15.6,
       21.7, 17.1, 17.2, 15. , 21.7, 18.6, 21. , 33.1, 31.5, 20.1, 29.8,
       15.2, 15. , 27.5, 22.6, 20. , 21.4, 23.5, 31.2, 23.7,  7.4, 48.3,
       24.4, 22.6, 18.3, 23.3, 17.1, 27.9, 44.8, 50. , 23. , 21.4, 10.2,
       23.3, 23.2, 18.9, 13.4, 21.9, 24.8, 11.9, 24.3, 13.8, 24.7, 14.1,
       18.7, 28.1, 19.8, 26.7, 21.7, 22. , 22.9, 10.4, 21.9, 20.6, 26.4,
       41.3, 17.2, 27.1, 20.4, 16.5, 24.4,  8.4, 23. ,  9.7, 50. , 30.5,
       12.3, 19.4, 21.2, 20.3, 18.8, 33.4, 18.5, 19.6, 33.2, 13.1,  7.5,
       13.6, 17.4,  8.4, 35.4, 24. , 13.4, 26.2,  7.2, 13.1, 24.5, 37.2,
       25. , 24.1, 16.6, 32.9, 36.2, 11. ,  7.2, 22

In [132]:
# Element-wise residuals on the test split: predicted minus actual.
diff = np.subtract(Predict_test, y_test)
diff

array([  2.33551831, -26.24331403,   6.3364008 ,   3.69898444,
         0.17583999,  -0.70281489,  -0.02977874,   2.5138302 ,
         2.95187659,   1.71028442,  -3.32314943,  -0.31584493,
         2.23177611,  -5.08867813,  -9.7839713 ,   3.30923608,
        -0.53554889,   3.20566714,   1.63913253,   2.17552674,
         0.95105205,   5.41122436,   0.25675756,   1.35679279,
         3.03344401, -12.91352003,  -2.15605183,  -1.15336155,
        -3.07021844,   2.11890427,   3.66969192,  -2.28441929,
        -1.15963906,   3.52642525,   5.38283423,   2.63835179,
         5.54022634, -25.17727949,   3.50566388,   2.27161273,
         2.3207858 ,   0.81349155,   2.37395321,   1.68276713,
         6.1264291 ,   3.29397648,  -0.54325973,   6.03741163,
        -1.95967937,  -3.13813485,   5.83835984,  -5.48877301,
         0.09248313, -12.86344016,  -2.04233832,  -0.16550348,
         3.43870439,  -2.16577602,  -0.65595739,   0.42289936,
        -1.52247735,   0.54237223,  -0.13358682,  -0.12

In [133]:
# Squared residual for every test row.
sq_err = np.square(diff)
sq_err

array([5.45464579e+00, 6.88711531e+02, 4.01499751e+01, 1.36824859e+01,
       3.09197008e-02, 4.93948775e-01, 8.86773577e-04, 6.31934230e+00,
       8.71357540e+00, 2.92507279e+00, 1.10433221e+01, 9.97580186e-02,
       4.98082461e+00, 2.58946451e+01, 9.57260943e+01, 1.09510435e+01,
       2.86812608e-01, 1.02763018e+01, 2.68675547e+00, 4.73291660e+00,
       9.04499996e-01, 2.92813491e+01, 6.59244424e-02, 1.84088668e+00,
       9.20178254e+00, 1.66759000e+02, 4.64855951e+00, 1.33024287e+00,
       9.42624128e+00, 4.48975530e+00, 1.34666388e+01, 5.21857148e+00,
       1.34476276e+00, 1.24356750e+01, 2.89749044e+01, 6.96090019e+00,
       3.06941079e+01, 6.33895403e+02, 1.22896792e+01, 5.16022441e+00,
       5.38604674e+00, 6.61768508e-01, 5.63565383e+00, 2.83170522e+00,
       3.75331335e+01, 1.08502811e+01, 2.95131132e-01, 3.64503392e+01,
       3.84034323e+00, 9.84789035e+00, 3.40864456e+01, 3.01266291e+01,
       8.55313017e-03, 1.65468093e+02, 4.17114582e+00, 2.73914014e-02,
      

In [134]:
# Mean squared error: average of the squared residuals.
MSE = sq_err.mean()

In [135]:
# Report the test-set mean squared error to two decimals.
print('MSE is %.2f'%MSE)

MSE is 27.18


In [136]:
# Same metric as a single expression: mean of the squared test-set residuals,
# computed straight from the model's predict() output.
print('MSE = %.2f' % np.mean(np.square(linReg.predict(X_test) - y_test)))

MSE = 27.18


In [137]:
# R^2 (coefficient of determination) on the held-out split.
# LinearRegression.score returns R^2 — the fraction of target variance the
# model explains — not a variance, so the label says so explicitly.
print('R^2 score is %.3f' % linReg.score(X_test, y_test))

The variance is 0.674
