# Boston Housing Price dataset for analysis on House Prices


In [3]:
# Importing the modules

import tensorflow as tf
import pandas as pd
import numpy as np
%matplotlib inline

import seaborn as sb

In [4]:
import tensorflow as tf
tf.__version__

'2.0.0'

In [5]:
# Load the dataset

data = tf.keras.datasets.boston_housing

(x_train,y_train),(x_test,y_test)= data.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/boston_housing.npz


In [6]:
column_names = ['crime', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
                'TAX', 'PTRATIO', 'B', 'LSTAT']

df_xtrain = pd.DataFrame(x_train,columns=column_names)

df_xtrain.head()

Unnamed: 0,crime,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,1.23247,0.0,8.14,0.0,0.538,6.142,91.7,3.9769,4.0,307.0,21.0,396.9,18.72
1,0.02177,82.5,2.03,0.0,0.415,7.61,15.7,6.27,2.0,348.0,14.7,395.38,3.11
2,4.89822,0.0,18.1,0.0,0.631,4.97,100.0,1.3325,24.0,666.0,20.2,375.52,3.26
3,0.03961,0.0,5.19,0.0,0.515,6.037,34.5,5.9853,5.0,224.0,20.2,396.9,8.01
4,3.69311,0.0,18.1,0.0,0.713,6.376,88.4,2.5671,24.0,666.0,20.2,391.43,14.65


This data frame contains the following columns:

**

crim per capita crime rate by town.

zn proportion of residential land zoned for lots over 25,000 sq.ft.

indus proportion of non-retail business acres per town.

chas Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).

nox nitrogen oxides concentration (parts per 10 million).

rm average number of rooms per dwelling.

age proportion of owner-occupied units built prior to 1940.

dis weighted mean of distances to five Boston employment centres.

rad index of accessibility to radial highways.

tax full-value property-tax rate per \$10,000.

ptratio pupil-teacher ratio by town.

black 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.

lstat lower status of the population (percent).

medv median value of owner-occupied homes in \$1000s.

**

In [7]:
# standardized/normalized data

mean=x_train.mean(axis=0)

std=x_train.std(axis=0)

x_train2=(x_train-mean)/std

df_train_norm = pd.DataFrame(x_train2, columns=column_names)

df_train_norm.head()

Unnamed: 0,crime,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.272246,-0.483615,-0.435762,-0.256833,-0.165227,-0.176443,0.813062,0.116698,-0.626249,-0.59517,1.1485,0.448077,0.82522
1,-0.403427,2.991784,-1.333912,-0.256833,-1.215182,1.894346,-1.910361,1.247585,-0.856463,-0.348433,-1.718189,0.431906,-1.329202
2,0.12494,-0.483615,1.028326,-0.256833,0.628642,-1.829688,1.110488,-1.187439,1.675886,1.565287,0.784476,0.220617,-1.3085
3,-0.401494,-0.483615,-0.869402,-0.256833,-0.36156,-0.324558,-1.236672,1.10718,-0.511142,-1.094663,0.784476,0.448077,-0.652926
4,-0.005634,-0.483615,1.028326,-0.256833,1.328612,0.153642,0.694808,-0.578572,1.675886,1.565287,0.784476,0.389882,0.263497


In [8]:
print(x_train.shape)
print(y_train.shape)

(404, 13)
(404,)


In [9]:
# combining x_train and y_train for fidning the corealation between features and sales

training_data=np.insert(arr=x_train,values=y_train,axis=1,obj=13)

column_names = ['crime' ,'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
                'TAX', 'PTRATIO', 'B', 'LSTAT','price']

df_train = pd.DataFrame(training_data,columns=column_names)

df_train.head()

Unnamed: 0,crime,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,price
0,1.23247,0.0,8.14,0.0,0.538,6.142,91.7,3.9769,4.0,307.0,21.0,396.9,18.72,15.2
1,0.02177,82.5,2.03,0.0,0.415,7.61,15.7,6.27,2.0,348.0,14.7,395.38,3.11,42.3
2,4.89822,0.0,18.1,0.0,0.631,4.97,100.0,1.3325,24.0,666.0,20.2,375.52,3.26,50.0
3,0.03961,0.0,5.19,0.0,0.515,6.037,34.5,5.9853,5.0,224.0,20.2,396.9,8.01,21.1
4,3.69311,0.0,18.1,0.0,0.713,6.376,88.4,2.5671,24.0,666.0,20.2,391.43,14.65,17.7


Another more easy way of finding correlation is this:

Downhill (negative / positive) linear relationship

Uphill (negative / positive) linear relationship

In [10]:
df = df_train.copy()

# Compute pairwise correlation of columns

df.corr()

Unnamed: 0,crime,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,price
crime,1.0,-0.192179,0.397419,-0.050828,0.405765,-0.217597,0.34441,-0.37859,0.609689,0.575652,0.273447,-0.390613,0.434384,-0.378498
ZN,-0.192179,1.0,-0.533823,-0.041981,-0.521713,0.338683,-0.578728,0.650787,-0.311091,-0.303522,-0.403139,0.176006,-0.415237,0.380299
INDUS,0.397419,-0.533823,1.0,0.052839,0.7742,-0.409924,0.65635,-0.725155,0.599226,0.701362,0.379284,-0.372885,0.603129,-0.476743
CHAS,-0.050828,-0.041981,0.052839,1.0,0.079803,0.040431,0.080488,-0.083101,-0.024851,-0.051343,-0.122008,0.037832,-0.011017,0.168661
NOX,0.405765,-0.521713,0.7742,0.079803,1.0,-0.335866,0.729338,-0.777062,0.616535,0.673471,0.18816,-0.409479,0.592994,-0.438328
RM,-0.217597,0.338683,-0.409924,0.040431,-0.335866,1.0,-0.240875,0.23397,-0.24399,-0.307904,-0.367256,0.145525,-0.610844,0.681483
AGE,0.34441,-0.578728,0.65635,0.080488,0.729338,-0.240875,1.0,-0.76667,0.462188,0.512746,0.282193,-0.278403,0.590898,-0.364173
DIS,-0.37859,0.650787,-0.725155,-0.083101,-0.777062,0.23397,-0.76667,1.0,-0.511179,-0.543668,-0.243067,0.295995,-0.507075,0.2539
RAD,0.609689,-0.311091,0.599226,-0.024851,0.616535,-0.24399,0.462188,-0.511179,1.0,0.922676,0.449908,-0.478245,0.49025,-0.375515
TAX,0.575652,-0.303522,0.701362,-0.051343,0.673471,-0.307904,0.512746,-0.543668,0.922676,1.0,0.440499,-0.471777,0.534752,-0.448737


so we can see that a few columns are primary affecting the price of the house

In [11]:
print(x_train.shape)
print(y_train.shape)
print(training_data.shape)
print(x_test.shape)
print(y_test.shape)

(404, 13)
(404,)
(404, 14)
(102, 13)
(102,)


In [14]:
Epochs=200

Loss="mse"

model=tf.keras.models.Sequential()

model.add(tf.keras.layers.Dense(128,activation=tf.nn.relu))

model.add(tf.keras.layers.Dense(64,activation=tf.nn.relu))

model.add(tf.keras.layers.Dense(1))

model.compile(optimizer='rmsprop',loss=Loss,metrics=['mae'])

history=model.fit(x_train,y_train,epochs=Epochs,validation_split=0.2,verbose=2)



Train on 323 samples, validate on 81 samples
Epoch 1/200
323/323 - 1s - loss: 703.5899 - mae: 16.3113 - val_loss: 142.4811 - val_mae: 7.7937
Epoch 2/200
323/323 - 0s - loss: 139.8305 - mae: 9.5984 - val_loss: 199.7269 - val_mae: 12.4697
Epoch 3/200
323/323 - 0s - loss: 174.5534 - mae: 11.3023 - val_loss: 193.2601 - val_mae: 12.2388
Epoch 4/200
323/323 - 0s - loss: 139.6899 - mae: 9.1923 - val_loss: 83.2218 - val_mae: 6.7153
Epoch 5/200
323/323 - 0s - loss: 147.2872 - mae: 9.9201 - val_loss: 522.4888 - val_mae: 20.0849
Epoch 6/200
323/323 - 0s - loss: 171.5932 - mae: 9.4899 - val_loss: 158.0215 - val_mae: 11.3560
Epoch 7/200
323/323 - 0s - loss: 160.0209 - mae: 10.4943 - val_loss: 422.2212 - val_mae: 19.3266
Epoch 8/200
323/323 - 0s - loss: 142.2727 - mae: 9.1833 - val_loss: 280.2575 - val_mae: 14.2695
Epoch 9/200
323/323 - 0s - loss: 107.7481 - mae: 8.0523 - val_loss: 86.6481 - val_mae: 7.8007
Epoch 10/200
323/323 - 0s - loss: 186.9567 - mae: 10.4615 - val_loss: 222.4544 - val_mae: 12.

Epoch 87/200
323/323 - 0s - loss: 36.7117 - mae: 4.4967 - val_loss: 57.0832 - val_mae: 5.2120
Epoch 88/200
323/323 - 0s - loss: 31.3868 - mae: 4.1238 - val_loss: 33.1440 - val_mae: 4.0149
Epoch 89/200
323/323 - 0s - loss: 44.7081 - mae: 5.0175 - val_loss: 44.4883 - val_mae: 4.3827
Epoch 90/200
323/323 - 0s - loss: 27.9247 - mae: 3.8694 - val_loss: 51.0696 - val_mae: 6.1109
Epoch 91/200
323/323 - 0s - loss: 42.1326 - mae: 5.0074 - val_loss: 46.0293 - val_mae: 4.5336
Epoch 92/200
323/323 - 0s - loss: 33.0598 - mae: 4.1806 - val_loss: 45.3501 - val_mae: 4.6371
Epoch 93/200
323/323 - 0s - loss: 33.7094 - mae: 4.2465 - val_loss: 36.1778 - val_mae: 4.2001
Epoch 94/200
323/323 - 0s - loss: 46.8934 - mae: 5.4861 - val_loss: 56.5195 - val_mae: 6.5581
Epoch 95/200
323/323 - 0s - loss: 33.5823 - mae: 4.4059 - val_loss: 36.4666 - val_mae: 4.8025
Epoch 96/200
323/323 - 0s - loss: 44.1954 - mae: 4.8126 - val_loss: 45.6298 - val_mae: 5.8134
Epoch 97/200
323/323 - 0s - loss: 37.1207 - mae: 4.5877 - va

Epoch 174/200
323/323 - 0s - loss: 18.2317 - mae: 3.1036 - val_loss: 20.5311 - val_mae: 3.3383
Epoch 175/200
323/323 - 0s - loss: 17.6954 - mae: 3.1519 - val_loss: 26.1130 - val_mae: 4.1567
Epoch 176/200
323/323 - 0s - loss: 23.3527 - mae: 3.5697 - val_loss: 36.4252 - val_mae: 5.0604
Epoch 177/200
323/323 - 0s - loss: 18.3225 - mae: 2.9623 - val_loss: 47.8225 - val_mae: 5.8023
Epoch 178/200
323/323 - 0s - loss: 25.4114 - mae: 3.7382 - val_loss: 26.0048 - val_mae: 4.2728
Epoch 179/200
323/323 - 0s - loss: 19.5144 - mae: 3.2163 - val_loss: 21.0023 - val_mae: 3.5963
Epoch 180/200
323/323 - 0s - loss: 20.6840 - mae: 3.3801 - val_loss: 23.9105 - val_mae: 3.5294
Epoch 181/200
323/323 - 0s - loss: 16.6781 - mae: 3.0055 - val_loss: 17.5229 - val_mae: 3.3642
Epoch 182/200
323/323 - 0s - loss: 25.8177 - mae: 3.6448 - val_loss: 33.2265 - val_mae: 4.6651
Epoch 183/200
323/323 - 0s - loss: 19.9072 - mae: 3.0897 - val_loss: 18.1756 - val_mae: 3.1778
Epoch 184/200
323/323 - 0s - loss: 15.8186 - mae: 

In [15]:
loss,mae=model.evaluate(x_test,y_test)

print('mae=', mae)

mae= 3.5364566


In [16]:
all_price_sum = y_train.sum()+y_test.sum()
Number_of_prices = y_train.size+y_test.size  #number of all labels

print("Average of prices of all the data = ",all_price_sum/Number_of_prices)

Average of prices of all the data =  22.53280632411067


# The average of price is 22,000 while our mean absolute error is around 3 to 4.5k