## AirBnB Optimal Price Predictor

This notebook downloads the data from remote used in this project and then cleans it. Ready to be used to build a model for predictions.


In [1]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
import category_encoders as ce
import numpy as np
import requests
import glob
import os
import httplib2

In [2]:
df = pd.read_csv('../data/raw/australia_visualisations_listings.csv.csv')

df

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,number_of_reviews_ltm,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,29786588,Bestview Cottage,224083235,Carmela,New South Wales,Orange,-33.33346,149.03763,Entire home/apt,190,1,81,21,2021-01-29,3.00,1,167
1,45318800,"Lakeside House- 23 Acres, 10 minutes from Orange!",366315077,Harry & Danielle,New South Wales,Orange,-33.33337,149.04834,Entire home/apt,1000,2,14,14,2021-01-21,3.62,1,29
2,22151808,Bright and Cosy Room in Orange.,78371092,Diana,New South Wales,Orange,-33.28470,149.10878,Private room,68,2,5,0,2018-10-06,0.14,4,274
3,22518202,HG 105A Sale St. STYLISH IN THE HEART OF ORANGE,89994665,"Marianne, Terry & Richard",New South Wales,Orange,-33.27709,149.09787,Entire home/apt,166,1,158,59,2021-01-25,4.48,2,320
4,21871881,"""BLUE & WHITE NILE"" Work, Rest or Play. Walk CBD",78345518,Kristin,New South Wales,Orange,-33.28375,149.11322,Entire home/apt,290,2,86,23,2021-01-26,2.26,1,92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153909,47383518,2 bedroom apartment in the heart of Dickson,37476631,Adam,Australian Capital Territory,Unincorporated ACT,-35.25055,149.13621,Entire home/apt,135,2,0,0,,,1,155
153910,25961546,Pialligo Vines - A Country Estate,19759369,Helen & Terry,Australian Capital Territory,Unincorporated ACT,-35.31129,149.18094,Entire home/apt,195,2,72,39,2021-01-25,2.38,1,172
153911,31335541,A spacious room with a spectacular view,234587680,Helen,Australian Capital Territory,Unincorporated ACT,-35.18414,149.11046,Private room,68,1,9,4,2020-12-11,0.39,2,84
153912,29217916,Bourkey's room one,219560800,Terry,Australian Capital Territory,Unincorporated ACT,-35.15848,149.09918,Private room,40,1,97,18,2021-01-25,3.48,3,68


In [3]:
# Check out for any null values
df.isnull().sum()
# No null values

id                                    0
name                                 20
host_id                               0
host_name                            78
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
number_of_reviews_ltm                 0
last_review                       25878
reviews_per_month                 25878
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [4]:
# Drop unnecessary columns and
# Rearrange columns in X, y format
df = df[['latitude', 'longitude', 'room_type',
       'minimum_nights', 'number_of_reviews', 'calculated_host_listings_count',
       'availability_365', 'price']]

In [5]:
df.describe(include='all')

Unnamed: 0,latitude,longitude,room_type,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365,price
count,153914.0,153914.0,153914,153914.0,153914.0,153914.0,153914.0,153914.0
unique,,,4,,,,,
top,,,Entire home/apt,,,,,
freq,,,114630,,,,,
mean,-33.257543,146.183371,,3.925848,29.800902,15.511221,164.495264,255.932709
std,5.074834,9.664016,,21.398849,53.670211,51.409165,140.028747,509.293129
min,-43.58005,96.83131,,1.0,0.0,1.0,0.0,0.0
25%,-37.572468,144.96547,,1.0,1.0,1.0,1.0,99.0
50%,-33.88687,150.378895,,2.0,8.0,2.0,155.0,163.0
75%,-31.884408,151.28735,,3.0,34.0,5.0,313.0,286.0


In [6]:
# Split in X and y

X = df[['latitude', 'longitude', 'room_type',
       'minimum_nights', 'number_of_reviews', 'calculated_host_listings_count',
       'availability_365']]
y = df[['price']]

X.shape, y.shape

((153914, 7), (153914, 1))

In [7]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.15, shuffle=True)

# See the shape
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((130826, 7), (23088, 7), (130826, 1), (23088, 1))

In [8]:
X_train.columns

Index(['latitude', 'longitude', 'room_type', 'minimum_nights',
       'number_of_reviews', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

In [9]:
# One Hot Encode room_type data


In [10]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130826 entries, 153717 to 121958
Data columns (total 7 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   latitude                        130826 non-null  float64
 1   longitude                       130826 non-null  float64
 2   room_type                       130826 non-null  object 
 3   minimum_nights                  130826 non-null  int64  
 4   number_of_reviews               130826 non-null  int64  
 5   calculated_host_listings_count  130826 non-null  int64  
 6   availability_365                130826 non-null  int64  
dtypes: float64(2), int64(4), object(1)
memory usage: 8.0+ MB


In [15]:
# TODO: Investigate why this is not working
"""
# OneHot Encode data
def ohe_transform(X_train, X_test):
       """
     """  OneHotEncoder transformer for X_train, X_test data categorical information
       :param X_train:
       :param X_test:
       :return: X_train_ohe, X_test_ohe"""
       """
       ohe = OrdinalEncoder()
       ohe.fit(X_train)
       X_train_ohe = X_train.transform(X_train)
       X_test_ohe = X_train.transform(X_test)

       return X_train_ohe, X_test_ohe

X_train_ohe, X_test_ohe = ohe_transform(X_train, X_test)


# prepare input data in quantitative data
def transform_categorical_data(X_train, X_test):
       oe = OrdinalEncoder()
       oe.fit(X_train)
       X_train_oe = oe.transform(X_train)
       X_test_oe = oe.transform(X_test)
       return X_train_oe, X_test_oe

# Run the function to have encoded X_train and X_test
X_train, X_test = transform_categorical_data(X_train, X_test)

"""

AttributeError: 'latitude' is not a valid function for 'Series' object

In [23]:
from sklearn.pipeline import make_pipeline
pipeline = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True)
)

#fit on train, score on val
pipeline.fit(X_train, y_train)
# print('Val accuracy', pipeline.score(X_val, y_val))

#before encoding
X_train.shape

#after encoding
encoder = pipeline.named_steps['onehotencoder']
X_train_ohe = encoder.transform(X_train)
#
X_train_ohe.head()

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,latitude,longitude,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,room_type_Hotel room,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365
153717,-43.03536,147.94319,1,0,0,0,1,95,3,45
44065,-33.79219,151.28553,0,1,0,0,2,43,2,364
127012,-26.69103,153.11247,1,0,0,0,2,104,1,233
122899,-28.55597,153.50109,0,1,0,0,1,1,1,0
46182,-32.4356,115.76014,1,0,0,0,2,70,2,287


In [24]:
# Transform X_test using OHE
X_test_ohe = encoder.transform(X_test)

In [28]:
# Export DataFrame data to numpy values
X_train_np = X_train_ohe.values
X_test_np = X_test_ohe.values

In [36]:
# Imports
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import backend as K

def coeff_determination(y_true, y_pred):
    SS_res =  K.sum(K.square( y_true-y_pred ))
    SS_tot = K.sum(K.square( y_true - K.mean(y_true) ) )
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

# Feed Forward neural network
model = Sequential()
model.add(Dense(10, input_dim=X_train_np.shape[1], activation='relu'))
model.add(Dense(128, activation='relu'))

model.add(Dense(1, activation='linear'))

# model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['accuracy' ,'mean_absolute_error'])
model.compile(loss='mean_absolute_error', optimizer='adam', metrics=[coeff_determination])

model.summary()


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 10)                110       
_________________________________________________________________
dense_10 (Dense)             (None, 128)               1408      
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 129       
Total params: 1,647
Trainable params: 1,647
Non-trainable params: 0
_________________________________________________________________


In [37]:
model.fit(X_train_np, y_train, batch_size=128,epochs=5, verbose=1, validation_data=(X_test_ohe, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fc0495fa7f0>

In [38]:
predicted_prices = model.predict(X_test_ohe)
predicted_prices

array([[200.13185],
       [170.14134],
       [209.03625],
       ...,
       [183.9169 ],
       [185.89641],
       [225.83519]], dtype=float32)

In [39]:
y_test

Unnamed: 0,price
104563,750
37231,106
95178,383
105091,99
105096,169
...,...
101361,883
126107,50
88415,280
69227,180
