In [1]:
# Import image data
# Import numerical data
# Hybrid model - image and numerical data, predict price
# Try InceptionV3 model

In [2]:
import tensorflow as tf
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense

import pandas as pd
import os
import numpy as np

import matplotlib.pyplot as plt

# NOTE(review): this makes every tf.function (Keras train/test steps included) run eagerly
# instead of as a compiled graph. TensorFlow documents the switch as a debugging aid;
# presumably a leftover - confirm it is still needed, since eager execution slows training.
tf.config.run_functions_eagerly(True)

In [3]:
print(tf.__version__)

2.5.0


## Notes


Priority:

- Use full dataset
- CNN model (custom defined)
- Trim off outliers (price more than a particular threshold)
- Scale down target column or scale down loss (custom loss)
- Hybrid model - Definitely do this
- Check out data generator parameters

Others:

- Learning rate
- Change activation function to maybe linear?
- Optimizers
- Unfreezing some of the lower layers
- More epochs - up to 50?
- Fewer units in dense layer
- Higher batch size (8,16)

In [5]:
# Read the cleaned train and test tables (feature columns plus the price column) from Excel

train_label_df, test_label_df = (
    pd.read_excel(xlsx_path)
    for xlsx_path in (
        "dataset_splits/cleaned_train_data.xlsx",
        "dataset_splits/cleaned_test_data.xlsx",
    )
)

In [6]:
train_label_df.columns[:30]

Index(['pictureid', 'bathroomcnt', 'bedroomcnt', 'calculatedbathnbr',
       'calculatedfinishedsquarefeet', 'fullbathcnt', 'garagecarcnt',
       'garagetotalsqft', 'latitude', 'longitude', 'lotsizesquarefeet',
       'roomcnt', 'yearbuilt', 'numberofstories', 'taxvaluedollarcnt',
       'assessmentyear', 'propertycountylandusecode_1',
       'propertycountylandusecode_34', 'propertycountylandusecode_38',
       'propertycountylandusecode_96', 'propertycountylandusecode_122',
       'propertycountylandusecode_135', 'propertylandusetypeid_246',
       'propertylandusetypeid_247', 'propertylandusetypeid_248',
       'propertylandusetypeid_261', 'propertylandusetypeid_266',
       'regionidcity_5465.0', 'regionidcity_6285.0', 'regionidcity_8384.0'],
      dtype='object')

In [7]:
train_label_df.head()

Unnamed: 0,pictureid,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,fullbathcnt,garagecarcnt,garagetotalsqft,latitude,longitude,...,censustractandblock_60590200000000,censustractandblock_60590300000000,censustractandblock_60590400000000,censustractandblock_60590500000000,censustractandblock_60590600000000,censustractandblock_60590700000000,censustractandblock_60590800000000,censustractandblock_60590900000000,censustractandblock_60591000000000,censustractandblock_60591100000000
0,0,5.5,5,5.5,5042,5,4,988,33509223,-117672518,...,0,0,1,0,0,0,0,0,0,0
1,1,1.5,3,1.5,1695,1,1,360,33820017,-117971956,...,0,0,0,0,0,0,0,1,0,0
2,2,2.0,4,2.0,1460,2,2,456,33677514,-117954806,...,0,0,0,0,0,0,0,0,1,0
3,3,1.0,2,1.0,1296,1,1,288,33744721,-117863804,...,0,0,0,0,0,0,1,0,0,0
4,4,2.0,3,2.0,1397,2,1,360,33756060,-117875860,...,0,0,0,0,0,0,1,0,0,0


In [9]:
# Rescale the price column by 1e5 (the "scale down target column" item from the notes)
train_label_df["target_scaled_down"] = train_label_df["taxvaluedollarcnt"].div(1e5)
test_label_df["target_scaled_down"] = test_label_df["taxvaluedollarcnt"].div(1e5)

In [10]:
train_label_df["target_scaled_down"].max()

216.44655

In [11]:
# Outlier cut-off: the 99th percentile of the scaled training target

threshold_value_saved = np.percentile(
    train_label_df["target_scaled_down"].to_numpy(),
    99,
)
print(threshold_value_saved)

26.647402400000004


In [10]:
# 10,000,000

In [12]:
# Keep only rows whose scaled target is strictly below the cut-off computed on the training set
train_label_df = train_label_df[train_label_df["target_scaled_down"].lt(threshold_value_saved)]
test_label_df = test_label_df[test_label_df["target_scaled_down"].lt(threshold_value_saved)]

In [13]:
print(f"Train df: {len(train_label_df)}, test df: {len(test_label_df)}")

Train df: 14849, test df: 4955


In [14]:
train_label_df.shape

(14849, 170)

In [15]:
y_training = np.array(train_label_df["target_scaled_down"])

In [16]:
X_training_df = train_label_df.drop(columns = ["pictureid", "taxvaluedollarcnt", "target_scaled_down"], axis = 1, errors='ignore')

In [17]:
y_training.shape

(14849,)

In [17]:
#list(X_training_df.columns)

In [18]:
X_training_df.shape

(14849, 167)

In [19]:
# Fit a StandardScaler on the 14 non-one-hot columns of the training features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(
    X_training_df[[
        'bathroomcnt', 'bedroomcnt', 'calculatedbathnbr',
        'calculatedfinishedsquarefeet', 'fullbathcnt', 'garagecarcnt',
        'garagetotalsqft', 'latitude', 'longitude', 'lotsizesquarefeet',
        'roomcnt', 'yearbuilt', 'numberofstories', 'assessmentyear',
    ]]
)

StandardScaler()

In [20]:
scaler.mean_

array([ 2.40767055e+00,  3.47228770e+00,  2.40767055e+00,  2.00388255e+03,
        2.24001616e+00,  1.89655869e+00,  4.74237996e+02,  3.37280968e+07,
       -1.17851247e+08,  7.36668355e+03,  5.20796013e+00,  1.97171028e+03,
        1.29678766e+00,  2.01599993e+03])

In [21]:
# Standardise the same 14 columns the scaler was fitted on (same order)
non_one_hot_array = scaler.transform(
    X_training_df[[
        'bathroomcnt', 'bedroomcnt', 'calculatedbathnbr',
        'calculatedfinishedsquarefeet', 'fullbathcnt', 'garagecarcnt',
        'garagetotalsqft', 'latitude', 'longitude', 'lotsizesquarefeet',
        'roomcnt', 'yearbuilt', 'numberofstories', 'assessmentyear',
    ]]
)

In [22]:
X_training = np.array(X_training_df)

In [23]:
X_training.shape

(14849, 167)

In [24]:
one_hot_array = X_training[:,14:]

In [25]:
one_hot_array.shape

(14849, 153)

In [26]:
non_one_hot_array.shape

(14849, 14)

In [27]:
X_training_final = np.concatenate((non_one_hot_array, one_hot_array), axis=1)

In [28]:
X_training_final.shape

(14849, 167)

In [29]:
X_training_df[:10]

Unnamed: 0,bathroomcnt,bedroomcnt,calculatedbathnbr,calculatedfinishedsquarefeet,fullbathcnt,garagecarcnt,garagetotalsqft,latitude,longitude,lotsizesquarefeet,...,censustractandblock_60590200000000,censustractandblock_60590300000000,censustractandblock_60590400000000,censustractandblock_60590500000000,censustractandblock_60590600000000,censustractandblock_60590700000000,censustractandblock_60590800000000,censustractandblock_60590900000000,censustractandblock_60591000000000,censustractandblock_60591100000000
0,5.5,5,5.5,5042,5,4,988,33509223,-117672518,29899,...,0,0,1,0,0,0,0,0,0,0
1,1.5,3,1.5,1695,1,1,360,33820017,-117971956,7568,...,0,0,0,0,0,0,0,1,0,0
2,2.0,4,2.0,1460,2,2,456,33677514,-117954806,6120,...,0,0,0,0,0,0,0,0,1,0
3,1.0,2,1.0,1296,1,1,288,33744721,-117863804,6750,...,0,0,0,0,0,0,1,0,0,0
4,2.0,3,2.0,1397,2,1,360,33756060,-117875860,6911,...,0,0,0,0,0,0,1,0,0,0
5,2.0,3,2.0,1604,2,2,430,33798257,-117826688,6120,...,0,0,0,0,0,0,1,0,0,0
6,3.0,3,3.0,1299,3,2,440,33646644,-117593253,4000,...,0,1,0,0,0,0,0,0,0,0
7,3.0,3,3.0,2191,3,2,484,33613380,-117916796,2640,...,0,0,0,0,1,0,0,0,0,0
8,2.0,3,2.0,1059,2,2,460,33630450,-117948241,2480,...,0,0,0,0,1,0,0,0,0,0
9,3.0,5,3.0,2358,3,2,493,33611755,-117676417,8750,...,0,1,0,0,0,0,0,0,0,0


In [30]:
pd.DataFrame(X_training_final)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
0,3.814660,1.733492,3.814660,3.569415,3.553726,4.068480,3.428188,-1.794520,1.357159,3.101725,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.119691,-0.535904,-1.119691,-0.362899,-1.596632,-1.734126,-0.762278,0.753643,-0.916594,0.027713,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.502897,0.598794,-0.502897,-0.638995,-0.309042,0.200076,-0.121697,-0.414722,-0.786367,-0.171614,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-1.736485,-1.670602,-1.736485,-0.831675,-1.596632,-1.734126,-1.242713,0.136300,-0.095353,-0.084891,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,-0.502897,-0.535904,-0.502897,-0.713013,-0.309042,-1.734126,-0.762278,0.229267,-0.186899,-0.062728,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14844,0.113896,1.733492,0.113896,0.437192,-0.309042,0.200076,-0.228460,-0.453028,0.243860,-0.100996,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
14845,-0.502897,-0.535904,-0.502897,-0.302980,-0.309042,0.200076,0.105175,0.088139,-1.099314,-0.153719,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
14846,0.730690,0.598794,0.730690,-0.302980,0.978547,-1.734126,-0.642169,0.442257,-0.437602,-0.022532,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
14847,0.113896,-0.535904,0.113896,-0.207815,-0.309042,0.200076,-0.275169,1.614107,-0.076764,-0.624643,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Make validation dataset as well

In [31]:
# Repeat the training-set preparation on the test split, reusing the scaler fitted on train.
y_testing = np.array(test_label_df["target_scaled_down"])
X_testing_df = test_label_df.drop(
    columns=["pictureid", "taxvaluedollarcnt", "target_scaled_down"],
    axis=1,
    errors='ignore',
)

# Standardised block: the 14 non-one-hot columns, in the order the scaler was fitted with
non_one_hot_array_testing = scaler.transform(
    X_testing_df[[
        'bathroomcnt', 'bedroomcnt', 'calculatedbathnbr',
        'calculatedfinishedsquarefeet', 'fullbathcnt', 'garagecarcnt',
        'garagetotalsqft', 'latitude', 'longitude', 'lotsizesquarefeet',
        'roomcnt', 'yearbuilt', 'numberofstories', 'assessmentyear',
    ]]
)

# Everything from column 14 onwards is taken unscaled as the one-hot block
X_testing = np.array(X_testing_df)
one_hot_array_testing = X_testing[:, 14:]

# Scaled columns first, then the one-hot columns
X_testing_final = np.hstack((non_one_hot_array_testing, one_hot_array_testing))

## Hybrid Model

In [32]:
import cv2

In [33]:
train_label_df["pictureid"][:10]

  train_label_df["pictureid"][:10]


0     0
1     1
2     2
3     3
4     4
5     5
6     6
7     8
8    10
9    12
Name: pictureid, dtype: int64

In [34]:
#os.listdir("dataset_splits/sub_splits/all_sub_pics/train_sub_pics/")
# One image path per row of train_label_df, in row order
pic_names_in_order = [
    f"dataset_splits/all_data_pics/train_data_pics/pic_{picture_id}.png"
    for picture_id in train_label_df["pictureid"].tolist()
]

In [35]:
pic_names_in_order[:5]

['dataset_splits/all_data_pics/train_data_pics/pic_0.png',
 'dataset_splits/all_data_pics/train_data_pics/pic_1.png',
 'dataset_splits/all_data_pics/train_data_pics/pic_2.png',
 'dataset_splits/all_data_pics/train_data_pics/pic_3.png',
 'dataset_splits/all_data_pics/train_data_pics/pic_4.png']

In [36]:
len(y_training)

14849

In [37]:
from tqdm import tqdm

In [38]:
IMG_WIDTH = 400
IMG_HEIGHT = 400
img_folder = "dataset_splits/all_data_pics/train_data_pics/"

def create_dataset(img_folder, pic_names_in_order):
    """Load, resize and normalise images, preserving the order of the given paths.

    Parameters
    ----------
    img_folder : str
        Not used by the function; kept so existing calls keep working. Each entry
        of ``pic_names_in_order`` is passed to ``cv2.imread`` as-is.
    pic_names_in_order : iterable of str
        Image file paths. The returned list follows this order.

    Returns
    -------
    list of numpy.ndarray
        One float32 array per path, resized to IMG_WIDTH x IMG_HEIGHT, with three
        channels in RGB order and values divided by 255.

    Raises
    ------
    FileNotFoundError
        If an image is missing or cannot be decoded. Stopping with the offending
        path is deliberate: silently returning a shorter list would leave the
        images out of step with the label and feature arrays.

    Notes
    -----
    Every image is kept in memory as float32, i.e. 400 * 400 * 3 * 4 bytes
    (about 1.9 MB) each at the current IMG_WIDTH/IMG_HEIGHT of 400.
    """
    img_data_array = []

    for filename in tqdm(pic_names_in_order):
        # imread's second argument is an IMREAD_* flag, not a COLOR_* conversion code.
        # IMREAD_COLOR always yields a 3-channel BGR image.
        image = cv2.imread(filename, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread returns None instead of raising when the file cannot be read
            raise FileNotFoundError(f"Could not read image: {filename}")

        # The BGR -> RGB conversion has to be done explicitly
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # cv2.resize expects dsize as (width, height)
        image = cv2.resize(image, (IMG_WIDTH, IMG_HEIGHT), interpolation=cv2.INTER_AREA)

        # Scale pixel values to [0, 1]
        img_data_array.append(image.astype('float32') / 255)

    return img_data_array

In [39]:
img_data_array = create_dataset(img_folder, pic_names_in_order)

100%|██████████| 14849/14849 [01:10<00:00, 209.62it/s]


In [40]:
img_data_array[0].shape

(400, 400, 3)

In [41]:
len(img_data_array)

14849

In [None]:
# Stack the per-image arrays into a single (n_images, height, width, channels) array.
# NOTE(review): with 14,849 float32 images of shape (400, 400, 3) this needs roughly
# 14849 * 400 * 400 * 3 * 4 bytes ~= 28.5 GB of RAM, on top of the source list -
# confirm the machine can hold it, or feed batches from disk instead.
img_data_array_final = np.array(img_data_array)

In [None]:
# Testing data: build the image paths in test_label_df row order, load them, stack them
pic_names_in_order_testing = [
    f"dataset_splits/all_test_pics/test_data_pics/pic_{picture_id}.png"
    for picture_id in test_label_df["pictureid"].tolist()
]
img_data_array_testing = create_dataset(
    img_folder="dataset_splits/all_test_pics/test_data_pics/",
    pic_names_in_order=pic_names_in_order_testing,
)
img_data_array_final_testing = np.array(img_data_array_testing)

## Finally train the model

In [None]:
from tensorflow import keras
from tensorflow.keras import Model, Input


In [None]:
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dense, Flatten, concatenate


In [None]:
# Two-input regression network: a small CNN over the image, concatenated with the
# tabular feature vector, followed by dense layers that predict the scaled price.
input_image = Input(shape=(IMG_HEIGHT, IMG_WIDTH, 3))
# Take the feature width from the prepared array instead of hard-coding it:
# X_training_final has 167 columns, so a fixed 159 would be rejected at fit time.
input_features = Input(shape=(X_training_final.shape[1],))

# apply convolutional layers to image branch
# (ReLU added: without an activation the stacked layers stay linear)
x = Conv2D(32, 3, activation="relu")(input_image)
x = Conv2D(32, 3, activation="relu")(x)
x = MaxPool2D(2)(x)
x = Flatten()(x)

# concatenate flattened image branch with input features
concat = concatenate([x, input_features])

# apply dense layers on concatenated data
dense = Dense(64, activation="relu")(concat)
# y_training is one value per sample, so the output layer has a single linear unit
output = Dense(1)(dense)


model = Model(inputs=[input_image, input_features], outputs=output)


In [None]:
model.compile(loss="mse", optimizer="adam", metrics=["mse", "mae"])

To-dos:
1. Reshape the inputs into the format (image_features, input_features, labels) so that they match the sizes expected in the training step
2. The data generator can no longer be used


In [None]:
# # To load previous checkpoint

# checkpoint_path = "CNN_checkpoints/v1.0.0/checkpoint.hdf5"

# # Load model from checkpoint
# try: 
#     print('** trying to load latest checkpoint')
#     model.load_weights( checkpoint_path )
#     print('** loaded latest checkpoint')
# except OSError:
#     print('** loading model from scratch')

In [None]:
checkpoint_dir = "hybrid_checkpoints/v2.0.0"

# os.mkdir raises if the parent "hybrid_checkpoints" folder is missing; makedirs creates
# the whole path, and exist_ok=True makes the cell safe to re-run.
os.makedirs(checkpoint_dir, exist_ok=True)

In [None]:
# Train on (images, tabular features) -> scaled price and evaluate on the test split
# after every epoch.
history = model.fit(
    [img_data_array_final, X_training_final],
    y_training,
    epochs=15,
    batch_size=4,
    steps_per_epoch=250,
    # validation_data must be an (inputs, targets) tuple; the targets were missing,
    # so no validation loss could be computed.
    validation_data=([img_data_array_final_testing, X_testing_final], y_testing),
    callbacks=[
        tf.keras.callbacks.ModelCheckpoint(
            filepath=f'{checkpoint_dir}/checkpoint.hdf5',
            save_weights_only=True,
            verbose=1,
            # Save every 3000 batches. With 15 epochs x 250 steps = 3750 batches in
            # total, this writes a checkpoint once per run.
            save_freq=3000,
        ),
        #tf.keras.callbacks.EarlyStopping( monitor='loss', patience=3 ),
    ],
    verbose=1,
)

In [None]:
history_df = pd.DataFrame(history.history) 

In [None]:
# Plot utility
def plot_graphs(history_object_df, string):
  """Plot one training metric per epoch, plus its validation curve when available.

  Parameters
  ----------
  history_object_df : pandas.DataFrame
      Training history with one column per metric (built from ``history.history``).
  string : str
      Name of the metric column to plot, e.g. "loss". If a column named
      ``'val_' + string`` exists, it is drawn on the same axes.
  """
  val_string = 'val_' + string

  plt.plot(history_object_df[string])
  legend_labels = [string]

  # Look for this metric's own validation column (not always "val_loss"), and only
  # add a legend entry for curves that are actually drawn.
  if val_string in history_object_df.columns:
    plt.plot(history_object_df[val_string])
    legend_labels.append(val_string)

  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend(legend_labels)
  plt.show()

In [None]:
# # adam 50 epochs, 4 batch size
plot_graphs(history_df, "loss")

## Compiling Hybrid model

In [None]:
# import tensorflow as tf
# from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dropout, Dense, Input
# from tensorflow.keras.models import Sequential

# training = True

In [None]:
# # Hybrid model

# # Numerical Layers

# num_data_input = Input(shape=(159,))
# hidden_1 = Dense(500, activation='relu')(num_data_input)
# hidden_2 = Dense(1000, activation='relu')(hidden_1)
# num_flatten = Flatten()(hidden_2)


# # Image Layers

# # Input
# image_data_input = Input(shape = (400, 400, 3))

# # Convolution 1
# conv1 = tf.keras.layers.Conv2D(64,(3,3), activation="relu", input_shape=(400,400, 3))(image_data_input)
# pool1 = tf.keras.layers.MaxPooling2D(2,2)(conv1)

# # Convolution 2
# conv2 = tf.keras.layers.Conv2D(64,(3,3), activation="relu")(pool1)
# pool2 = tf.keras.layers.MaxPooling2D(2,2)(conv2)

# # Convolution 3
# conv3 = tf.keras.layers.Conv2D(128, (3,3), activation='relu')(pool2)
# pool3 = tf.keras.layers.MaxPooling2D(2,2)(conv3)

# # Convolution 3
# conv4 = tf.keras.layers.Conv2D(128, (3,3), activation='relu')(pool3)
# pool4 = tf.keras.layers.MaxPooling2D(2,2)(conv4)

# # Flatten the results to feed into a DNN
# image_layer = tf.keras.layers.Flatten()(pool4)

# # Dropout to avoid overfitting
# if training:
#     image_layer = tf.keras.layers.Dropout(0.5)(image_layer)


# # Add numerical data
# hybrid = tf.concat([image_layer, num_flatten], axis=-1)
# # Normalize
# hybrid = tf.keras.layers.LayerNormalization(epsilon=1e-6)(hybrid)


# # Final linear regression layer
# # 512 neuron hidden layer
# output = tf.keras.layers.Dense(512, activation='relu')(hybrid)

# model = tf.keras.Model(inputs=[image_data_input, num_data_input], outputs=[output])

In [None]:
# model.compile(loss="mse", optimizer="adam", metrics=["mse", "mae"])

In [None]:
# checkpoint_dir = "hybrid_checkpoints/v1.0.0"

# if not os.path.exists(checkpoint_dir):
#     os.mkdir(checkpoint_dir)

In [None]:
# # Train
# history = model.fit(
#     [train_generator, X_training_final],
#     y_training,
#     validation_data = validation_generator,
#     epochs=10,
#     steps_per_epoch=250,
#     callbacks=[
#         tf.keras.callbacks.ModelCheckpoint( 
#             filepath = f'{checkpoint_dir}/checkpoint.hdf5',
#             save_weights_only = True, 
#             verbose = 1, 
#             save_freq = 2000, # Save every 2000 batches/steps
#         ),
#         #tf.keras.callbacks.EarlyStopping( monitor='loss', patience=3 ),
#     ],
#     verbose=1
# )