![Image of house](https://platinumrealestate.vegas/wp-content/uploads/2015/12/home-for-sale-830x323.jpg)


## We are going to implement a very simple machine learning model to forecast housing prices in the city of Boston, USA.

In [1]:
# Core data-handling and modelling libraries used throughout the notebook.
import pandas as pd
import numpy as np
from sklearn import linear_model

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Fall back to
# the legacy location only when running on an older release.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split



In [6]:
# The Boston housing dataset ships with scikit-learn, so we load it directly.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the installed version still provides it.
from sklearn.datasets import load_boston
boston = load_boston()

In [5]:
# Let's read a bit about the dataset. Printing the description renders its
# line breaks; a bare `boston.DESCR` would show the raw string with "\n" escapes.
print(boston.DESCR)



In [9]:
# Show the feature (column) names first, then the response values on the following line(s).
print(boston.feature_names, boston.target, sep="\n")

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
[24.  21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 15.  18.9 21.7 20.4
 18.2 19.9 23.1 17.5 20.2 18.2 13.6 19.6 15.2 14.5 15.6 13.9 16.6 14.8
 18.4 21.  12.7 14.5 13.2 13.1 13.5 18.9 20.  21.  24.7 30.8 34.9 26.6
 25.3 24.7 21.2 19.3 20.  16.6 14.4 19.4 19.7 20.5 25.  23.4 18.9 35.4
 24.7 31.6 23.3 19.6 18.7 16.  22.2 25.  33.  23.5 19.4 22.  17.4 20.9
 24.2 21.7 22.8 23.4 24.1 21.4 20.  20.8 21.2 20.3 28.  23.9 24.8 22.9
 23.9 26.6 22.5 22.2 23.6 28.7 22.6 22.  22.9 25.  20.6 28.4 21.4 38.7
 43.8 33.2 27.5 26.5 18.6 19.3 20.1 19.5 19.5 20.4 19.8 19.4 21.7 22.8
 18.8 18.7 18.5 18.3 21.2 19.2 20.4 19.3 22.  20.3 20.5 17.3 18.8 21.4
 15.7 16.2 18.  14.3 19.2 19.6 23.  18.4 15.6 18.1 17.4 17.1 13.3 17.8
 14.  14.4 13.4 15.6 11.8 13.8 15.6 14.6 17.8 15.4 21.5 19.6 15.3 19.4
 17.  15.6 13.1 41.3 24.3 23.3 27.  50.  50.  50.  22.7 25.  50.  23.8
 23.8 22.3 17.4 19.1 23.1 23.6 22.6 29.4 23.2 24.6 29.9 37.2 

In [13]:
# Wrap the feature matrix and the target vector in pandas DataFrames;
# the feature frame takes its column labels from the dataset's feature names.
df_X, df_y = (
    pd.DataFrame(data=boston.data, columns=boston.feature_names),
    pd.DataFrame(data=boston.target),
)

In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for every feature column
df_X.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.593761,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.596783,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.647423,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


In [15]:
# Create the linear regression estimator; it is not fitted yet (fitting happens in a later cell)
reg = linear_model.LinearRegression()

In [16]:
# Split features and target into training and testing sets: 20% of the rows are
# held out for testing, and the fixed random_state makes the split reproducible.
# The feature frame is df_X (capital X), as defined when the DataFrames were built.
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.2, random_state=4
)

In [17]:
# Fit the model on the training portion only; the test rows stay unseen until prediction
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [19]:
# Now that the model is fitted, inspect the learned weights: one coefficient per
# feature column (13 values, nested in a single row because y is a one-column frame)
reg.coef_

array([[-1.14743504e-01,  4.70875035e-02,  8.70282354e-03,
         3.23818824e+00, -1.67240567e+01,  3.87662996e+00,
        -1.08218769e-02, -1.54144627e+00,  2.92604151e-01,
        -1.33989537e-02, -9.07306805e-01,  8.91271054e-03,
        -4.58747039e-01]])

In [40]:
# Predict the target for the held-out test features
pred = reg.predict(X_test)

In [47]:
# Print the first 4 predicted values, then the full y_test frame; the first 4 rows
# of y_test are the actual values that correspond to those predictions.
print(pred[0], pred[1], pred[2], pred[3])
print(y_test)

[12.06508881] [26.98544801] [17.59242607] [18.15842166]
        0
8    16.5
289  24.8
68   17.4
211  19.3
226  37.6
70   24.2
55   35.4
470  19.9
409  27.5
154  17.0
344  31.2
272  24.4
310  16.1
160  27.0
319  21.0
454  14.9
11   18.9
399   6.3
413  16.3
25   13.9
418   8.8
153  19.4
124  18.8
108  19.8
345  17.5
103  19.3
209  20.0
129  14.3
432  16.1
106  19.5
..    ...
63   25.0
239  23.3
123  17.3
64   33.0
139  17.8
314  23.8
427  10.9
102  18.6
45   19.3
476  16.7
80   28.0
14   18.2
278  29.1
505  11.9
341  32.7
497  18.3
212  22.4
280  45.4
229  31.5
203  48.5
317  19.8
232  41.7
327  22.2
309  20.3
269  20.7
368  50.0
144  11.8
336  19.5
437   8.7
216  23.3

[102 rows x 1 columns]


In [27]:
# Mean squared error on the test set: the average of the squared differences between
# predictions and actual values. y_test is a single-column DataFrame, so the result
# is displayed as a one-element Series rather than a bare float.
np.mean((pred-y_test)**2)

0    25.407977
dtype: float64

## Since this is a basic example of a simple predictive model (linear regression), note that we could get more accurate results by varying the model's parameters or by using a more suitable model for this task.
As we move to better models, our mean squared error should go down.