In [None]:
import os
# TF_CPP_MIN_LOG_LEVEL only takes effect if it is set BEFORE TensorFlow is first
# imported, so the environment variable is configured ahead of the tensorflow imports.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" # suppress info and warning messages
import tensorflow as tf
import tensorflow.keras as keras
print(tf.__version__)
import math
import time
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

2.18.0


In [None]:
# Since we use our own dataset, our importing process is a bit different: we downloaded
# the data set and ran the code cells in a local Jupyter Notebook.
# To run this notebook on another machine, set the LIVING_ROOM_DATASET_DIR environment
# variable to the dataset folder; if it is not set, the original local path is used.
dataset_dir = os.environ.get('LIVING_ROOM_DATASET_DIR', '/Users/sophiacherkaoui/Downloads/living_room_dataset')

In [None]:
# Load the dataset from the folder "living_room_dataset", where each subdirectory is
# named [label] and contains the subset of images with that label.
CLASS_NAMES = ["Deluxe", "Luxury", "Standard"]

# `rentals` is the full batched dataset; it is kept for previewing examples and for
# looking up class names.
rentals = keras.utils.image_dataset_from_directory(dataset_dir, labels='inferred', class_names=CLASS_NAMES, seed=1234)
dataset_size = len(rentals)  # NOTE: this is the number of batches, not the number of images

# Split the data into 90% training, 10% test sets.
# The split is delegated to image_dataset_from_directory (validation_split + subset="both")
# instead of rentals.take()/rentals.skip() for two reasons:
#   1. take()/skip() operate on batches, so with only a few batches the split is far
#      from 90/10 (e.g. 3 batches -> 2 training batches / 1 test batch).
#   2. the loaded dataset is shuffled, and a shuffled pipeline may be traversed in a
#      different order on each pass, so take()/skip() does not guarantee that the
#      training and test sets are disjoint.
train_ds, test_ds = keras.utils.image_dataset_from_directory(
    dataset_dir,
    labels='inferred',
    class_names=CLASS_NAMES,
    validation_split=0.1,
    subset="both",
    seed=1234,
)
train_size = len(train_ds)  # number of training batches

NotFoundError: Could not find directory /Users/sophiacherkaoui/Downloads/living_room_dataset

In [None]:
# Show four examples from the dataset.
# The dataset is unbatched once and take(4) pulls four different images from a single
# pass. Calling next(iter(rentals.unbatch())) inside the loop would rebuild the iterator
# on every iteration and only ever read the first element of a fresh pass, which is
# slow and can show the same image more than once.
for img_tensor, label in rentals.unbatch().take(4):
    img_np = img_tensor.numpy()
    # cast to uint8 so matplotlib interprets the pixel values as 0-255 colors
    plt.imshow(img_np.astype("uint8"))
    # `label` is a scalar tensor; convert it to a plain int to index the class-name list
    plt.title(rentals.class_names[int(label)])
    plt.axis(False)
    plt.show()

In [None]:
# Display the shape of one example: `img_np` is the last image shown by the preview loop above.
img_np.shape

In [None]:
# Convert a batched dataset (training or test) into numpy arrays.
def to_numpy(dataset):
    """Collect every batch of `dataset` into one image array and one label array.

    Each item yielded by `dataset` must be a tuple (image_batch, label_batch) whose
    members expose a `.numpy()` method. Batches are joined along axis 0 in the
    order they are yielded.

    Returns:
        (X, y): the concatenated images and the concatenated labels.
    """
    # a single pass over the dataset, converting both halves of each batch
    batches = [
        (image_batch.numpy(), label_batch.numpy())
        for image_batch, label_batch in dataset
    ]

    # stitch the per-batch arrays together along the example axis
    X = np.concatenate([batch_images for batch_images, _ in batches], axis=0)
    y = np.concatenate([batch_labels for _, batch_labels in batches], axis=0)
    return X, y

In [None]:
# Convert the training and test batches into numpy arrays
# (X_* receive the images, y_* receive the labels returned by to_numpy).

X_train, y_train = to_numpy(train_ds)
X_test, y_test = to_numpy(test_ds)

In [None]:
# Normalize pixel values by dividing by 255 (assumes the raw pixel values are in [0, 255]).
# NOTE: this cell rebinds X_train / X_test, so run it exactly once after the conversion
# cell above; re-running it on its own would divide by 255 a second time.
X_train = X_train / 255.0
X_test = X_test / 255.0

# Verify values were normalized (all values should be floats in range [0.0, 1.0]).
# A bare expression in the middle of a cell is never displayed and dumping a whole image
# is not a real check, so the value range is asserted and summarized explicitly.
for split_name, split in (("X_train", X_train), ("X_test", X_test)):
    lowest, highest = split.min(), split.max()
    assert 0.0 <= lowest and highest <= 1.0, split_name + " is not in the range [0.0, 1.0]"
    print(split_name + ": min=" + str(lowest) + ", max=" + str(highest) + ", dtype=" + str(split.dtype))

In [None]:
# Report the shape of every training/test array, one line per array.
for prefix, array in (
    ("X_train: ", X_train),
    ("X_test: ", X_test),
    ("y_train: ", y_train),
    ("y_test: ", y_test),
):
    print(prefix + str(array.shape))

In [None]:
# Verify the shape of one training example.
# Expected format: (pixel_height, pixel_width, num_color_channels),
# where num_color_channels = 3 since the images are RGB.
X_train[0].shape

In [None]:
# Build the model: input -> two (Conv2D -> BatchNorm -> ReLU) blocks ->
# global average pooling -> dense output layer.
cnn_model = keras.Sequential()

# the input shape is taken from a single training example
input_shape = X_train[0].shape
input_layer = keras.layers.InputLayer(input_shape)
cnn_model.add(input_layer)

# create first hidden layer: 16 filters, kernel size 3
conv_1 = keras.layers.Conv2D(16, 3)
batchNorm_1 = keras.layers.BatchNormalization()
ReLU_1 = keras.layers.ReLU()
cnn_model.add(conv_1)
cnn_model.add(batchNorm_1)
cnn_model.add(ReLU_1)

# create second hidden layer: 32 filters, kernel size 3
conv_2 = keras.layers.Conv2D(32, 3)
batchNorm_2 = keras.layers.BatchNormalization()
ReLU_2 = keras.layers.ReLU()
cnn_model.add(conv_2)
cnn_model.add(batchNorm_2)
cnn_model.add(ReLU_2)

# add pooling layer
pooling_layer = keras.layers.GlobalAveragePooling2D()
cnn_model.add(pooling_layer)

# Output layer: one unit per class, with no activation so it emits raw scores.
# The dataset has three classes (Deluxe, Luxury, Standard), so the unit count is
# derived from the class list instead of a hard-coded 5, which gave the model two
# outputs that correspond to no label.
num_classes = len(rentals.class_names)
output_layer = keras.layers.Dense(units=num_classes)
cnn_model.add(output_layer)

cnn_model.summary()

In [None]:
# Optimizer: stochastic gradient descent with a learning rate of 0.1.
sgd_optimizer = keras.optimizers.SGD(learning_rate=0.1)

In [None]:
# Loss function: sparse categorical cross-entropy. from_logits=True tells the loss
# that the model outputs raw scores (the output Dense layer has no activation).
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
# Compile the model with the optimizer and loss defined above, tracking accuracy as the metric.
cnn_model.compile(optimizer=sgd_optimizer, loss=loss_fn, metrics=['accuracy'])

In [None]:
# Train the model for `num_epochs` epochs and report the wall-clock training time.
num_epochs = 1

t0 = time.time() # start time
history = cnn_model.fit(X_train, y_train, epochs=num_epochs)
t1 = time.time() # stop time

elapsed_seconds = t1 - t0
print('Elapsed time: %.2fs' % elapsed_seconds)

In [None]:
# Evaluate the trained model on the test arrays, then report its loss and accuracy.
test_metrics = cnn_model.evaluate(X_test, y_test)
loss, accuracy = test_metrics

print('Loss: ', loss, 'Accuracy: ', accuracy)

## Part 2: Defining ML Problem

Questions:
1. List the data set you have chosen.
2. What will you be predicting? What is the label?
3. Is this a supervised or unsupervised learning problem? Is this a clustering, classification or regression problem? Is it a binary classification or multi-class classification problem?
4. What are your features? (note: this list may change after you explore your data)
5. Explain why this is an important problem. In other words, how would a company create value with a model that predicts this label?

## The Answers

1. The data set we chose is the airbnbListings data set. However, due to the nature of our project, we decided to augment this data set with images scraped from the Airbnb website. We utilized scrapers from the following repositories to obtain images and prices:

https://github.com/johnbalvin/pyairbnb

https://github.com/airbert-vln/bnb-dataset/tree/main

However, we made some edits to these scripts when necessary to fit our desired dataset size. We also created some of our own scripts to further assist in the data organization. These can be found on our [Github](https://github.com/soph743/cnn_pricing_model).

2. We are predicting the price range, and therefore tier (standard, deluxe or luxury) of Airbnb listings depending on images of the listing's bedroom.

3. This is a supervised learning problem because we predefined our labels as standard, deluxe or luxury. The nightly pricing thresholds are as follows:

- Standard: $137 or less

- Deluxe: $138 - $240

- Luxury: $241+

This problem is also a multi-class classification problem because we grouped each bedroom image into one of the three categories above.

4. Since this is an image classification problem, our features are the actual images which the model was trained on. As the Convolutional Neural Network learns from these images, it uses certain patterns it detects in the pixel values to classify the test data.

5. This is an important problem because Airbnb has over 8 million listings worldwide; humans cannot possibly sift through all of these listings in a timely manner while also providing quality service in helping guests find their next stay. With an automated system like this, we greatly increase the efficiency of searching and filtering for hosts and customers. This is also an important problem because it improves the service of hosts and therefore customer satisfaction. For instance, this model could help new hosts improve their listings with visual feedback, and we could even add a feature where we provide a checklist for hosts to address. For example, “To be considered ‘Deluxe’, consider adding X, Y, Z.” A company could create value with a model that predicts this tier label in several ways. One is automated quality assessment: instead of relying solely on host-provided descriptions (which, according to our research, are often fabricated for marketing purposes), Airbnb could automatically assess the visual quality of a listing's interior. Airbnb could even go one step further and suggest appropriate pricing tiers and flag any listings that don’t match their claimed category, thereby improving consistency and trust with guests/consumers on the platform. Moreover, in terms of scaling our project, we could plan potential integration into recommendation engines, such as filtering listings by visual appeal and tier without requiring structured input data (and perhaps expanding to Zillow, Redfin, etc.). Also, we could provide property managers/hosts with visual feedback to guide renovation or staging decisions aligned with target pricing tiers. Our model can also be integrated with a front-end application where users can attach images of their rental properties and receive a suggested price range. Such an application can be utilized by Airbnb or other vacation rental sites to help hosts determine appropriate pricing for their properties.

## Part 4: Define Your Project Plan



1.  Do you have a new feature list? If so, what are the features that you chose to keep and remove after inspecting the data?
2. Explain different data preparation techniques that you will use to prepare your data for modeling.
3. What is your model (or models)?
4. Describe your plan to train your model, analyze its performance and then improve the model. That is, describe your model building, validation and selection plan to produce a model that generalizes well to new data.

## The Answers

1. Yes, we chose to omit all features in the Airbnb listing dataset, as stated previously. Our images act as the features which our CNN is trained on.
2. The different data preparation techniques we used to prepare our data for modeling included:
- Running scripts to extract, organize, and resize our images.
- Scraping information on nightly price for each listing, which we then used to determine the ranges for our three classes: Standard, Deluxe and Luxury.
- Choosing a subset of images with similar characteristics to allow for better model predictions (in our case, we chose only bedroom images).
- Normalizing pixel data in Jupyter Notebook.
- Addressing class imbalance in our data sample to promote fair AI in two ways. First, we set the price cutoffs so that each category (standard, deluxe or luxury) contains a similar number of images. Second, we did not arrange the images by our own standards; rather, the images were selected randomly using an automated scraper tool.
3. Our model is a convolutional neural network (CNN) that classifies Airbnb properties into 3 price tiers (standard, deluxe, luxury), based solely on visual features extracted from a set of 85 images. We used a small dataset to speed up the training process; our intention is to scale up the model with larger sets of images in order to improve model accuracy.
4. Due to the small size of data used, paired with the complexity of our problem (predicting price of a rental property based off of bedroom photos), we do not expect a high accuracy. When we initially trained the model, we had an accuracy of 57% and a loss of 7. This high loss shows the model's inability to learn patterns in the data adequately, which is what we predicted due to the reasons discussed previously.

However, we were able to minimize the loss by implementing a few changes:
- including an input layer, 2 hidden layers, a pooling layer and output layer: due to the small dataset used, extra layers are not necessary and can lead to overfitting.
- setting the learning rate to 0.1: we found that decreasing the learning rate increased the loss slightly and did not improve accuracy.
- setting the number of filters and the kernel size in the hidden convolution layers to (16, 3) and (32, 3): we found that raising or lowering these values in each layer led to a decrease in accuracy.

Our changes resulted in a model with 57% accuracy and a loss of 1.3. These statistics show that our model is still poorly suited to generalizing to new data, as the CNN has not properly learned to classify the images. Below are some steps we can take to improve model accuracy:

1. Increase the size of the data set.
2. Select one city/zip code to pull data from: since we used a set of 12 cities in the US, this may have added extra complexity to the model. By sticking to one city, the model may be able to learn pricing patterns better.
3. Add non-image features: extra features such as square footage may provide useful additional information to improve predictions.
4. Address class imbalance: our "luxury" category had the fewest examples, as it was the category with the highest prices. We can add more images in this category.
5. Use data augmentation: by changing the orientation of the images, we can prevent overfitting.
6. Implement more techniques learned in class such as K-fold cross-validation, L2 regularization and dropout.






Our main motive behind choosing this project was to challenge ourselves. It is beyond the scope of the requirements of this project, yet we followed our ambition and intellectual curiosity to try something different by building a convolutional neural network. We have a vision to continue this work and even implement a front-end component. We want to incorporate a lightweight Flask interface for an accessible, browser-based experience that provides clean, intuitive output for non-technical users.