Predicting house prices using Image and Data

In [19]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras import Model, Input
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dense, Flatten, Dropout
from keras.layers import BatchNormalization
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import glob
import cv2
import os
import locale

In [20]:
!git clone https://github.com/emanhamed/Houses-dataset.git

fatal: destination path 'Houses-dataset' already exists and is not an empty directory.


In [21]:
inputPath = "C:\\Users\\Sepehr\\ML_file\\Deep-Learning\\BDI\\Houses-dataset\\Houses_Dataset\\HousesInfo.txt"
datasetPath = "C:\\Users\\Sepehr\\ML_file\\Deep-Learning\\BDI\\Houses-dataset\\Houses_Dataset"

cols = ["bedrooms", "bathrooms", "area", "zipcode", "price"]
df = pd.read_csv(inputPath, sep=" ", header=None, names=cols)


zipcodes, counts = np.unique(df["zipcode"], return_counts=True)

# loop over each of the unique zip codes and their corresponding
# count
for (zipcode, count) in zip(zipcodes, counts):
    # the zip code counts for our housing dataset is *extremely*
    # unbalanced (some only having 1 or 2 houses per zip code)
    # so let's sanitize our data by removing any houses with less
    # than 25 houses per zip code
    if count < 25:
        idxs = df[df["zipcode"] == zipcode].index
        df.drop(idxs, inplace=True)

In [22]:
print(count)

4


In [23]:
# initialize our images array (i.e., the house images themselves)
images = []

# loop over the indexes of the houses
for i in df.index.values:
    # find the four images for the house and sort the file paths,
    # ensuring the four are always in the *same order*
    basePath = os.path.sep.join([datasetPath, "{}_*".format(i + 1)])
    housePaths = sorted(list(glob.glob(basePath)))
    # initialize our list of input images along with the output image
    # after *combining* the four input images
    inputImages = []
    outputImage = np.zeros((64, 64, 3), dtype="uint8")

    # loop over the input house paths
    for housePath in housePaths:
        # load the input image, resize it to be 32 32, and then
        # update the list of input images
        image = cv2.imread(housePath)
        image = cv2.resize(image, (32, 32))
        inputImages.append(image)

    # tile the four input images in the output image such the first
    # image goes in the top-right corner, the second image in the
    # top-left corner, the third image in the bottom-right corner,
    # and the final image in the bottom-left corner
    outputImage[0:32, 0:32] = inputImages[0]
    outputImage[0:32, 32:64] = inputImages[1]
    outputImage[32:64, 32:64] = inputImages[2]
    outputImage[32:64, 0:32] = inputImages[3]

    # add the tiled image to our set of images the network will be
    # trained on
    images.append(outputImage)
images = np.array(images)

In [24]:
images = images / 255.0

In [39]:
image

array([[[153, 168, 184],
        [158, 175, 191],
        [155, 173, 190],
        ...,
        [ 92,  99, 116],
        [140, 153, 169],
        [132, 144, 160]],

       [[160, 176, 193],
        [159, 176, 194],
        [156, 173, 194],
        ...,
        [ 29,  50,  93],
        [137, 149, 168],
        [139, 152, 169]],

       [[162, 178, 196],
        [160, 176, 195],
        [157, 174, 197],
        ...,
        [ 80, 118, 173],
        [137, 151, 169],
        [142, 158, 176]],

       ...,

       [[161, 175, 193],
        [161, 165, 177],
        [148, 162, 180],
        ...,
        [ 17,  50, 116],
        [167, 173, 185],
        [193, 194, 208]],

       [[159, 173, 191],
        [151, 166, 182],
        [146, 158, 177],
        ...,
        [ 40,  50,  83],
        [153, 160, 177],
        [195, 194, 206]],

       [[157, 172, 188],
        [151, 166, 182],
        [161, 174, 190],
        ...,
        [ 19,  28,  41],
        [ 85,  92, 148],
        [193, 187, 198]]

In [25]:
# partition the data into training and testing splits using 75% of
# the data for training and the remaining 25% for testing
split = train_test_split(df, images, test_size=0.25, random_state=42)
(trainAttrX, testAttrX, trainImagesX, testImagesX) = split

In [26]:
# find the largest house price in the training set and use it to
# scale our house prices to the range [0, 1] (will lead to better
# training and convergence)
maxPrice = trainAttrX["price"].max()
trainY = trainAttrX["price"] / maxPrice
testY = testAttrX["price"] / maxPrice

In [27]:
# initialize the column names of the continuous data
continuous = ["bedrooms", "bathrooms", "area"]

# performin min-max scaling each continuous feature column to
# the range [0, 1]
cs = MinMaxScaler()
trainContinuous = cs.fit_transform(trainAttrX[continuous])
testContinuous = cs.transform(testAttrX[continuous])

# one-hot encode the zip code categorical data (by definition of
# one-hot encoing, all output features are now in the range [0, 1])
zipBinarizer = LabelBinarizer().fit(df["zipcode"])
trainCategorical = zipBinarizer.transform(trainAttrX["zipcode"])
testCategorical = zipBinarizer.transform(testAttrX["zipcode"])

# construct our training and testing data points by concatenating
# the categorical features with the continuous features
trainAttrX = np.hstack([trainCategorical, trainContinuous])
testAttrX = np.hstack([testCategorical, testContinuous])

In [28]:
dim = trainAttrX.shape[1] #10

# define our MLP network
inp = keras.Input(shape=(dim,))
x = Dense(16, activation="relu")(inp)
x = Dense(7, activation="relu")(x)
out = Dense(4, activation="relu")(x)

model = keras.Model(inp, out)

In [29]:


width, height, depth = 64, 64, 3
filters=(16, 32, 64)
# initialize the input shape and channel dimension, assuming
# TensorFlow/channels-last ordering
inputShape = (height, width, depth)

# define the model input
inputs = Input(shape=inputShape)

# loop over the number of filters
for (i, f) in enumerate(filters):
    # if this is the first CONV layer then set the input
    # appropriately
    if i == 0:
        x = inputs

    # CONV => RELU => BN => POOL
    x = Conv2D(f, (3, 3), padding="same")(x)
    x = Activation("relu")(x)
    x = BatchNormalization()(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)

# flatten the volume, then FC => RELU => BN => DROPOUT
x = Flatten()(x)
x = Dense(16)(x)
x = Activation("relu")(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)

# apply another FC layer, this one to match the number of nodes
# coming out of the MLP
x = Dense(4)(x)
x = Activation("relu")(x)

# construct the CNN
cnn = Model(inputs, x)

In [30]:
from keras.layers import concatenate
# create the input to our final set of layers as the *output* of both
# the MLP and CNN
combinedInput = concatenate([model.output, cnn.output])

# our final FC layer head will have two dense layers, the final one
# being our regression head
x = Dense(4, activation="relu")(combinedInput)
x = Dense(1, activation="linear")(x)

model = Model([model.input, cnn.input], x)

In [31]:
model.compile(loss="mean_absolute_percentage_error", optimizer='adam')

In [32]:
keras.utils.plot_model(model, show_shapes=True)


You must install pydot (`pip install pydot`) for `plot_model` to work.


In [33]:
# train the model
from  keras import callbacks

reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1,
                              patience=35)

model.fit(
    [trainAttrX, trainImagesX], trainY,
    validation_data=([testAttrX, testImagesX], testY),
    epochs=200, batch_size=8, callbacks=[reduce_lr])

Epoch 1/200




[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - loss: 1142.7137



[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 46ms/step - loss: 1135.7834 - val_loss: 82.5456 - learning_rate: 0.0010
Epoch 2/200
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - loss: 411.3386 - val_loss: 69.8695 - learning_rate: 0.0010
Epoch 3/200
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - loss: 317.1809 - val_loss: 64.1474 - learning_rate: 0.0010
Epoch 4/200
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - loss: 260.6678 - val_loss: 60.1161 - learning_rate: 0.0010
Epoch 5/200
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - loss: 209.5428 - val_loss: 58.5568 - learning_rate: 0.0010
Epoch 6/200
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - loss: 216.3258 - val_loss: 97.3168 - learning_rate: 0.0010
Epoch 7/200
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - loss: 157.2444 - val_loss: 240.3195 - le

<keras.src.callbacks.history.History at 0x13b2b9b6720>

In [34]:
# make predictions on the testing data
preds = model.predict([testAttrX, testImagesX])

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 276ms/step


In [36]:
# compute the difference between the *predicted* house prices and the
# *actual* house prices, then compute the percentage difference and
# the absolute percentage difference
diff = preds.flatten() - testY
percentDiff = (diff / testY) * 100
absPercentDiff = np.abs(percentDiff)

# compute the mean and standard deviation of the absolute percentage
# difference
mean = np.mean(absPercentDiff)
std = np.std(absPercentDiff)

# finally, show some statistics on our model
locale.setlocale(locale.LC_ALL, "en_US.UTF-8")
print("[INFO] avg. house price: {}, std house price: {}".format(
    locale.currency(df["price"].mean(), grouping=True),
    locale.currency(df["price"].std(), grouping=True)))
print("[INFO] mean: {:.2f}%, std: {:.2f}%".format(mean, std))

[INFO] avg. house price: $533,388.27, std house price: $493,403.08
[INFO] mean: 20.44%, std: 17.79%


In [41]:
model.save('C:\\Users\\Sepehr\\ML_file\\Deep-Learning\\BDI\\Houses-dataset\\myModel.keras')