## AirBnB Optimal Price Predictor

This notebook downloads the data from remote used in this project and then cleans it. Ready to be used to build a model for predictions.


In [1]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
import category_encoders as ce
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras import backend as K
from sklearn.pipeline import make_pipeline
import numpy as np

In [2]:
import sklearn
import tensorflow
print('Pandas version: ',pd.__version__)
print('sklearn version: ',sklearn.__version__)
print('category_encoders version: ',ce.__version__)
print('tensorflow version: ',tensorflow.__version__)
print('numpy version: ',np.__version__)

# pipenv installation string
#  pip install pandas==1.1.5 scikit-learn==0.22.2 category_encoders==2.2.2 tensorflow==2.5.0

"""
Output from above should match the following in order to run the following code successfully:-

Pandas version:  1.1.5
sklearn version:  0.22.2
category_encoders version:  2.2.2
tensorflow version:  2.5.0
numpy version:  1.18.5
"""

Pandas version:  1.1.5
sklearn version:  0.22.2
category_encoders version:  2.2.2
tensorflow version:  2.5.0
numpy version:  1.18.5


'\nOutput from above should match the following in order to run the following code successfully:-\n\nPandas version:  1.1.5\nsklearn version:  0.22.2\ncategory_encoders version:  2.2.2\ntensorflow version:  2.5.0\nnumpy version:  1.18.5\n'

In [3]:
df = pd.read_csv('../data/raw/merged_df.csv')

df

Unnamed: 0,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365,filename
0,Lordelo do Ouro e Massarelos,41.15010,-8.66035,Entire home/apt,75.0,5.0,17.0,1.0,0.0,portugal_porto_2020-10-21.csv
1,Mafamude e Vilar do Paraíso,41.10739,-8.59430,Entire home/apt,73.0,30.0,39.0,3.0,1.0,portugal_porto_2020-10-21.csv
2,Cete,41.17481,-8.35362,Entire home/apt,66.0,3.0,14.0,3.0,0.0,portugal_porto_2020-10-21.csv
3,Cete,41.17449,-8.35426,Entire home/apt,66.0,3.0,8.0,3.0,0.0,portugal_porto_2020-10-21.csv
4,"Cedofeita, Ildefonso, Sé, Miragaia, Nicolau, V...",41.14918,-8.60922,Private room,20.0,2.0,136.0,6.0,231.0,portugal_porto_2020-10-21.csv
...,...,...,...,...,...,...,...,...,...,...
12155789,Hennepin,45.01486,-93.30547,Private room,40.0,1.0,3.0,1.0,362.0,united-states_twin-cities-msa_2020-05-12.csv
12155790,Hennepin,45.00723,-93.29911,Private room,50.0,2.0,0.0,1.0,86.0,united-states_twin-cities-msa_2020-05-12.csv
12155791,Ramsey,45.08582,-93.02009,Entire home/apt,159.0,5.0,0.0,2.0,87.0,united-states_twin-cities-msa_2020-05-12.csv
12155792,Scott,44.74567,-93.39000,Entire home/apt,300.0,5.0,0.0,1.0,356.0,united-states_twin-cities-msa_2020-05-12.csv


In [4]:
df.columns

Index(['neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'calculated_host_listings_count',
       'availability_365', 'filename'],
      dtype='object')

In [5]:
# Check out for any null values
df.isnull().sum()
# No null values

neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
calculated_host_listings_count    0
availability_365                  0
filename                          0
dtype: int64

In [6]:
# Drop unnecessary columns and
# Rearrange columns in X, y format
df = df[['latitude', 'longitude', 'neighbourhood', 'room_type',
       'minimum_nights', 'number_of_reviews', 'calculated_host_listings_count',
       'availability_365', 'price']]

In [7]:
df.describe(include='all')

Unnamed: 0,latitude,longitude,neighbourhood,room_type,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365,price
count,12155790.0,12155790.0,12155794,12155794,12155790.0,12155790.0,12155790.0,12155790.0,12155790.0
unique,,,7044,4,,,,,
top,,,I Centro Storico,Entire home/apt,,,,,
freq,,,151180,8419984,,,,,
mean,30.02005,0.5946818,,,6.55203,24.74396,13.78731,172.2744,914.1725
std,27.62039,74.90316,,,23.81185,52.41428,51.68367,146.0285,9022.593
min,-46.90624,-176.2681,,,1.0,0.0,1.0,0.0,10.0
25%,31.21871,-58.40575,,,1.0,0.0,1.0,1.0,60.0
50%,40.41108,2.56503,,,2.0,4.0,2.0,168.0,108.0
75%,45.54059,18.2975,,,4.0,23.0,6.0,340.0,251.0


In [8]:
# Split in X and y

X = df[['latitude', 'longitude', 'neighbourhood', 'room_type',
       'minimum_nights', 'number_of_reviews', 'calculated_host_listings_count',
       'availability_365']]
y = df[['price']]

X.shape, y.shape

((12155794, 8), (12155794, 1))

In [9]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.15, shuffle=True)

# See the shape
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((10332424, 8), (1823370, 8), (10332424, 1), (1823370, 1))

In [10]:
# OneHot Encode data
def ohe_transform(X_train, X_test):
       """
       OneHotEncoder transformer for X_train, X_test data categorical information
       :param X_train:
       :param X_test:
       :return: X_train_ohe, X_test_ohe"""
       ohe = ce.OrdinalEncoder()
       ohe.fit(X_train)
       X_train_ohe = ohe.transform(X_train)
       X_test_ohe = ohe.transform(X_test)

       return X_train_ohe, X_test_ohe

X_train_ohe, X_test_ohe = ohe_transform(X_train, X_test)
X_train.shape, X_test.shape

((10332424, 8), (1823370, 8))

In [11]:
# # Ordinal Encode Data
"""
def transform_categorical_data(X_train, X_test):
       oe = OrdinalEncoder()
       oe.fit(X_train)
       X_train_oe = oe.transform(X_train)
       X_test_oe = oe.transform(X_test)
       return X_train_oe, X_test_oe

# Run the function to have encoded X_train and X_test
X_train_oe, X_test_oe = transform_categorical_data(X_train, X_test)"""

'\ndef transform_categorical_data(X_train, X_test):\n       oe = OrdinalEncoder()\n       oe.fit(X_train)\n       X_train_oe = oe.transform(X_train)\n       X_test_oe = oe.transform(X_test)\n       return X_train_oe, X_test_oe\n\n# Run the function to have encoded X_train and X_test\nX_train_oe, X_test_oe = transform_categorical_data(X_train, X_test)'

In [12]:
# Export DataFrame data to numpy values
X_train_np = X_train_ohe.values
X_test_np = X_test_ohe.values

In [13]:
# Defining vanilla Feedforward neural network
model = Sequential()
model.add(Dense(10, input_dim=X_train_np.shape[1], activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))

# Output layer
model.add(Dense(1))

# Comile
model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 10)                90        
_________________________________________________________________
dense_1 (Dense)              (None, 256)               2816      
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_4 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 33        
Total params: 46,171
Trainable params: 46,171
Non-trainable params: 0
____________________________________________________

In [14]:
# Fit the model
model.fit(X_train_np, y_train, batch_size=128,epochs=4, verbose=1, validation_data=(X_test_np, y_test))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7fc1476a9940>

In [15]:
# Get predicted prices and compare with the actual data
predicted_prices = model.predict(X_test_ohe)
predicted_prices

array([[ 37.249405],
       [584.1244  ],
       [ 68.76402 ],
       ...,
       [138.3226  ],
       [ 51.466778],
       [ 89.57875 ]], dtype=float32)

In [16]:
# Compare with the following actual data
y_test

Unnamed: 0,price
7683666,25.0
2633928,310.0
146861,51.0
6446737,112.0
10720873,16.0
...,...
12006374,80.0
10899517,61.0
1299064,101.0
9074683,34.0


### In following code we are going to export the model for production

In [17]:
X_test_ohe.head(1)

Unnamed: 0,latitude,longitude,neighbourhood,room_type,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365
7683666,51.53543,0.04419,376.0,1,2.0,6.0,2.0,0.0


In [18]:
# Checkout a prediction from the model
test_prediction = {'latitude': -38.25482, 'longitude': 144.50328, 'neighbourhood': 376, 'room_type': 1, 'minimum_nights': 2, 'number_of_reviews': 6,
       'calculated_host_listings_count': 2, 'availability_365': 0}
test_prediction_df = pd.DataFrame([test_prediction])

model.predict(test_prediction_df)

array([[52.70908]], dtype=float32)

In [26]:
# Save Model
model.save('../models/airbnbpredict_all_data.h5')

In [27]:
# Test by loading and predicting

# load model
test_model = load_model('../models/airbnbpredict_all_data.h5')

# Model Summary
test_model.summary()

# Checkout a prediction from the model
test_prediction = {'latitude': -38.25482, 'longitude': 144.50328, 'neighbourhood': 376, 'room_type': 1, 'minimum_nights': 2, 'number_of_reviews': 6,
       'calculated_host_listings_count': 2, 'availability_365': 0}
test_prediction_df = pd.DataFrame([test_prediction])

# Predict to check
test_model.predict(test_prediction_df)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 10)                90        
_________________________________________________________________
dense_5 (Dense)              (None, 256)               2816      
_________________________________________________________________
dense_6 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_7 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_8 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 33        
Total params: 46,171
Trainable params: 46,171
Non-trainable params: 0
__________________________________________________

array([[68.955025]], dtype=float32)