# Machine Learning Applications for Airbnb Data

### Group 3 - Dhruv Shah, Jenn Hong, Setu Shah, Sonya Dreyer

---



• State the problem

• Tell us who cares about this problem and why

• Describe your data – where it came from, what it contains

• Present some interesting descriptive analyses (plots/tables) that motivates your exercise

• Present your main results

• Which methods worked best for your problem?

• What were the challenges you faced? Tell us about the biggest challenge you faced and how you
overcame it (or, tried but did not – that’s fine too – not every problem has a solution.)

• Conclude – what have you learnt that can be put to practice?

# Data Cleaning

---



In [None]:
# Import preprocessing libraries

import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Download the file

!wget 'https://maven-datasets.s3.amazonaws.com/Airbnb/Airbnb+Data.zip'

--2023-11-18 20:29:59--  https://maven-datasets.s3.amazonaws.com/Airbnb/Airbnb+Data.zip
Resolving maven-datasets.s3.amazonaws.com (maven-datasets.s3.amazonaws.com)... 52.216.161.147, 52.217.168.145, 54.231.170.121, ...
Connecting to maven-datasets.s3.amazonaws.com (maven-datasets.s3.amazonaws.com)|52.216.161.147|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 91005234 (87M) [application/zip]
Saving to: ‘Airbnb+Data.zip’


2023-11-18 20:30:02 (44.4 MB/s) - ‘Airbnb+Data.zip’ saved [91005234/91005234]



In [None]:
# Unzip the file

!unzip Airbnb+Data.zip

Archive:  Airbnb+Data.zip
   creating: Airbnb Data/
  inflating: Airbnb Data/Listings.csv  
  inflating: Airbnb Data/Listings_data_dictionary.csv  
  inflating: Airbnb Data/Reviews.csv  
  inflating: Airbnb Data/Reviews_data_dictionary.csv  


In [None]:
# Load the data frames
#
# The archive is extracted into the working directory, so the CSVs are read
# through a path relative to it instead of a hard-coded absolute location.
# encoding='latin1' and low_memory=False are kept exactly as before.

DATA_DIR = 'Airbnb Data'

listings = pd.read_csv(f'{DATA_DIR}/Listings.csv', encoding='latin1', low_memory=False)

reviews = pd.read_csv(f'{DATA_DIR}/Reviews.csv', encoding='latin1', low_memory=False)

In [46]:
# Number of missing values in each column of listings
listings.isna().sum()

listing_id                          0
name                              173
host_id                             0
host_since                        165
host_location                     840
host_response_time             128782
host_response_rate             128782
host_acceptance_rate           113087
host_is_superhost                   0
host_total_listings_count         165
host_has_profile_pic                0
host_identity_verified              0
neighbourhood                       0
district                       242700
city                                0
latitude                            0
longitude                           0
property_type                       0
room_type                           0
accommodates                        0
bedrooms                        29435
amenities                           0
minimum_nights                      0
maximum_nights                      0
review_scores_rating            91405
review_scores_accuracy          91713
review_score

In [47]:
# Share of missing values in each column, expressed as a percentage of all rows
listings.isna().sum().div(len(listings)).mul(100)

listing_id                      0.000000
name                            0.061849
host_id                         0.000000
host_since                      0.058989
host_location                   0.300309
host_response_time             46.040928
host_response_rate             46.040928
host_acceptance_rate           40.429799
host_is_superhost               0.000000
host_total_listings_count       0.058989
host_has_profile_pic            0.000000
host_identity_verified          0.000000
neighbourhood                   0.000000
district                       86.767818
city                            0.000000
latitude                        0.000000
longitude                       0.000000
property_type                   0.000000
room_type                       0.000000
accommodates                    0.000000
bedrooms                       10.523324
amenities                       0.000000
minimum_nights                  0.000000
maximum_nights                  0.000000
review_scores_ra

In [None]:
# Print the column names, non-null counts and dtypes of listings
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279712 entries, 0 to 279711
Data columns (total 33 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   listing_id                   279712 non-null  int64  
 1   name                         279539 non-null  object 
 2   host_id                      279712 non-null  int64  
 3   host_since                   279547 non-null  object 
 4   host_location                278872 non-null  object 
 5   host_response_time           150930 non-null  object 
 6   host_response_rate           150930 non-null  float64
 7   host_acceptance_rate         166625 non-null  float64
 8   host_is_superhost            279547 non-null  object 
 9   host_total_listings_count    279547 non-null  float64
 10  host_has_profile_pic         279547 non-null  object 
 11  host_identity_verified       279547 non-null  object 
 12  neighbourhood                279712 non-null  object 
 13 

In [None]:
# Parse host_since into pandas datetimes

listings['host_since'] = pd.to_datetime(listings['host_since'])

In [None]:
# Rescale review_scores_rating to an out-of-10 scale by dividing it by 10.
# NOTE: the column is overwritten, so running this cell again divides the
# values by 10 a second time.

listings['review_scores_rating'] = listings['review_scores_rating'].div(10)

In [None]:
# Converting prices to USD
#
# Every price is multiplied by a per-city exchange rate and stored in a new
# usd_price column; the original price column is then dropped.
#
# NOTE(review): rates are matched to cities purely by position, i.e. by the
# order in which each city first appears in listings — confirm that order
# whenever the rates or the input file change.

cities = listings['city'].unique()
exchange_rates = [1.0808, 1, 0.028388, 0.20328, 0.65462, 0.039480, 1.0808, 0.12777, 0.0493, 0.053215] # update these numbers before fitting models

# zip() silently stops at the shorter input, which would leave some cities
# without a rate (or some rates unused). Fail loudly instead.
if len(cities) != len(exchange_rates):
    raise ValueError(
        f"Have {len(exchange_rates)} exchange rates but found {len(cities)} cities: {list(cities)}"
    )
currency_map = dict(zip(cities, exchange_rates))

# Vectorised lookup and multiply: same values as a row-by-row apply, without
# calling a Python function once per listing.
listings['usd_price'] = listings['price'] * listings['city'].map(currency_map) # create new column
listings = listings.drop(columns='price') # drop original column

In [None]:
# Converting to numerical category
#
# 't' becomes 1 and every other value becomes 0.
# Potentially problematic -> NULL values are therefore converted to zero too.

flag_columns = [
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'instant_bookable',
]

for flag in flag_columns:
    # A column that is already numeric has been converted by an earlier run of
    # this cell. Comparing its 0/1 values with 't' again would never match and
    # would overwrite the whole column with zeros, so leave it alone.
    if pd.api.types.is_numeric_dtype(listings[flag]):
        continue
    listings[flag] = (listings[flag] == 't').astype('int64')

In [None]:
# We can or should drop listing_id, host_id, property, neighbourhood

# We can drop district: it is only populated for New York listings and is NULL everywhere else (about 87% of rows)

# We should drop name and possibly host_location (unless we want to/can figure out how to extract precise location --> latitude and longitude can be used to create clusters like in the lab)

# All host locations within each country have been mapped to the most prominent city in that country

# We need to possibly impute values (or drop columns) for host response time/rate, host_acceptance_rate, and some of the ratings columns [Iterative Imputer]

In [None]:
# Peek at the first two listings after the cleaning steps
listings.head(n=2)

Unnamed: 0,listing_id,name,host_id,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,...,maximum_nights,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,usd_price
0,281420,"Beautiful Flat in le Village Montmartre, Paris",1466919,2011-12-03,"Paris, Ile-de-France, France",,,,0,1.0,...,1125,10.0,10.0,10.0,10.0,10.0,10.0,10.0,0,57.2824
1,3705183,39 mÃÂ² Paris (Sacre CÃâur),10328771,2013-11-29,"Paris, Ile-de-France, France",,,,0,1.0,...,1125,10.0,10.0,10.0,10.0,10.0,10.0,10.0,0,129.696


In [None]:
# Number of listings per neighbourhood, most frequent first
listings['neighbourhood'].value_counts()

I Centro Storico     14874
Sydney                8074
Copacabana            7712
Cuauhtemoc            7626
Buttes-Montmartre     7237
                     ...  
Lighthouse Hill          1
Willowbrook              1
Magalhaes Bastos         1
Woodrow                  1
Agua Santa               1
Name: neighbourhood, Length: 660, dtype: int64

# Preprocessing

---



In [None]:
# Jenn's one hot encoding for amenities

In [None]:
# How often each exact amenities string occurs (whole lists, not single amenities)
listings.loc[:, 'amenities'].value_counts()

["Long term stays allowed"]                                                                                                                                                                                                                                                                                                                                                               1388
["Long term stays allowed", "Iron", "Air conditioning", "Wifi", "Kitchen"]                                                                                                                                                                                                                                                                                                                 376
[]                                                                                                                                                                                                                                        

In [None]:
# amentity_column.value_counts()

In [None]:
# Flatten every listing's amenities into one long list and count how often
# each individual amenity appears.
#
# Each cell of the amenities column is parsed from text into a Python list.
# ast.literal_eval only accepts literals, whereas eval() would execute any
# expression that happened to be embedded in the downloaded CSV.
amentity_column = listings['amenities'].apply(ast.literal_eval)

list_of_amenities = [item for sublist in amentity_column for item in sublist]

pd.Series(list_of_amenities).value_counts()

Wifi                                                                          260090
Essentials                                                                    253532
Long term stays allowed                                                       241054
Kitchen                                                                       240923
TV                                                                            213037
                                                                               ...  
Fridgedare stainless steel gas stove                                               1
Frigedare stainless steel oven                                                     1
Fridgedare Stainless Steel refrigerator                                            1
HDTV with Amazon Prime Video, Apple TV, Chromecast, HBO Max, Netflix, Roku         1
Gautier Bluetooth sound system                                                     1
Length: 3446, dtype: int64

In [None]:
# Frequency of each individual amenity, most common first
amenity_series = pd.Series(list_of_amenities)
ordered_amenities = amenity_series.value_counts()

In [None]:
# Keep only the amenities that occur in more than 100,000 listings
ordered_amenities = ordered_amenities.loc[ordered_amenities.gt(100000)]
ordered_amenities

Wifi                       260090
Essentials                 253532
Long term stays allowed    241054
Kitchen                    240923
TV                         213037
Hangers                    211356
Hair dryer                 188724
Iron                       187756
Washer                     185073
Heating                    184327
Dedicated workspace        179267
Shampoo                    174082
Hot water                  165163
Smoke alarm                156467
Air conditioning           142693
Dishes and silverware      123394
Refrigerator               123259
Cooking basics             110255
Elevator                   101582
Bed linens                 100486
Microwave                  100470
dtype: int64

In [None]:
# Turn the counts Series into a two-column frame: 'index' (amenity) and 'counts'
ordered_amenities = ordered_amenities.rename('counts').reset_index()

In [None]:
# Names of the retained amenities, in descending order of frequency
list_of_amenities = ordered_amenities['index'].tolist()

21

In [None]:
# Add one 0/1 indicator column to listings for every amenity in
# list_of_amenities.
#
# listings['amenities'] stores each list as text, so `amenity in text` would be
# a substring test: "TV" would also match an entry such as "HDTV with ...".
# Parsing each cell into a set makes the check an exact membership test.
amenity_sets = listings['amenities'].apply(lambda text: set(ast.literal_eval(text)))

# 1 if the listing mentions the amenity, 0 otherwise
for amenity in list_of_amenities:
    listings[amenity] = amenity_sets.apply(lambda listed: 1 if amenity in listed else 0)

In [48]:
# Show all column names of listings
listings.columns

Index(['listing_id', 'name', 'host_id', 'host_since', 'host_location',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_total_listings_count',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'district', 'city', 'latitude', 'longitude', 'property_type',
       'room_type', 'accommodates', 'bedrooms', 'amenities', 'minimum_nights',
       'maximum_nights', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'instant_bookable', 'usd_price', 'Wifi',
       'Essentials', 'Long term stays allowed', 'Kitchen', 'TV', 'Hangers',
       'Hair dryer', 'Iron', 'Washer', 'Heating', 'Dedicated workspace',
       'Shampoo', 'Hot water', 'Smoke alarm', 'Air conditioning',
       'Dishes and silverware', 'Refrigerator', 'Cooking basics', 'Elevator',
       'Bed

In [None]:
#creating a list of unique amenities mentioned
# list_of_amenities = []
# amentity_column = listings['amenities'].apply(eval)

# for sublist in amentity_column:
#   for item in sublist:
#     if item not in list_of_amenities:
#       list_of_amenities.append(item)

# list_of_amenities
# #making columns of nulls with column names from the amenity list
# from numpy import NaN

# for col in list_of_amenities:
#   listings[col] = NaN

# #creating lists of 1 and 0s if the amentity is mentioned in the appropriate columns
# for amenity in list_of_amenities:
#   listings[amenity] = listings['amenities'].apply(lambda x: 1 if amenity in x else 0)

In [None]:
##########################################################################

In [65]:
# Columns used for modelling: the predictors plus the usd_price target
model_columns = [
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'city',
    'instant_bookable',
    'host_total_listings_count',
    'accommodates',
    'bedrooms',
    'minimum_nights',
    'maximum_nights',
    'usd_price',
]

# Keep only the rows that have a value in every selected column
training_df = listings[model_columns].dropna()

In [59]:
# Preview the target column of the modelling frame
training_df.loc[:, 'usd_price']

0          57.2824
1         129.6960
2          96.1912
3          62.6864
4          64.8480
            ...   
279707    129.6960
279708     64.8480
279709     54.0400
279710    113.4840
279711     75.6560
Name: usd_price, Length: 250132, dtype: float64

In [66]:
# Hold out 20% of the rows as a test set to estimate generalization error

from sklearn.model_selection import train_test_split

target_col = "usd_price"
y = training_df[target_col]
X = training_df.drop(columns=[target_col])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Shapes of the four resulting splits
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((200105, 10), (50027, 10), (200105,), (50027,))

In [67]:
# Preprocessing: one-hot encode the categorical columns and standardise the
# numeric ones

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Render estimators as diagrams when they are displayed
set_config(display='diagram')

# not sure if host_since (maybe split by months) is included here
cat_attribs = [
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'city',
    'instant_bookable',
]

# excluding latitude and longitude
num_attribs = [
    'host_total_listings_count',
    'accommodates',
    'bedrooms',
    'minimum_nights',
    'maximum_nights',
]

categorical_step = ("cat", OneHotEncoder(drop="first"), cat_attribs)
numeric_step = ("num", StandardScaler(), num_attribs)
preprocess_pipeline = ColumnTransformer([categorical_step, numeric_step])

preprocess_pipeline

In [63]:
# preprocess_pipeline.fit_transform(X_train, y_train)

array([[ 0.        ,  1.        ,  0.        , ..., -0.4473252 ,
        -0.20709383, -0.00294582],
       [ 0.        ,  1.        ,  0.        , ...,  0.41949327,
        -0.14781722, -0.00314125],
       [ 0.        ,  1.        ,  1.        , ..., -0.4473252 ,
         0.05965089, -0.0031556 ],
       ...,
       [ 0.        ,  1.        ,  1.        , ..., -0.4473252 ,
        -0.20709383, -0.0031573 ],
       [ 0.        ,  1.        ,  1.        , ...,  1.28631174,
        -0.20709383, -0.00294582],
       [ 0.        ,  1.        ,  0.        , ...,  0.41949327,
        -0.11817892, -0.00315522]])

In [68]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# Linear regression fitted on the preprocessed features
lr_steps = [
    ('preprocessor', preprocess_pipeline),
    ('model', LinearRegression()),
]
lr_pipeline = Pipeline(steps=lr_steps)

lr_pipeline.fit(X_train, y_train)

In [72]:
from sklearn.metrics import mean_squared_error

# Test-set RMSE of the linear regression pipeline.
# The square root is taken explicitly because mean_squared_error's
# `squared=False` flag is deprecated since scikit-learn 1.4 and is no longer
# accepted by later releases.
y_pred = lr_pipeline.predict(X_test)
float(np.sqrt(mean_squared_error(y_test, y_pred)))

388.50873336824054

In [None]:
#RandomForestRegressor

In [78]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

# Random forest fitted on the same preprocessed features.
# random_state fixes the bootstrap and feature sampling so the fitted model and
# its test score are reproducible between runs; it matches the seed used for
# the train/test split. n_jobs=-1 builds the trees on all available cores.
rfr_pipeline = Pipeline([
    ('preprocessor', preprocess_pipeline),
    ('model', RandomForestRegressor(random_state=0, n_jobs=-1))
])

rfr_pipeline.fit(X_train, y_train)

In [80]:
from sklearn.metrics import mean_squared_error

# Test-set RMSE of the random forest pipeline.
# The square root is taken explicitly because mean_squared_error's
# `squared=False` flag is deprecated since scikit-learn 1.4 and is no longer
# accepted by later releases.
y_pred = rfr_pipeline.predict(X_test)
float(np.sqrt(mean_squared_error(y_test, y_pred)))

362.0797710870456

In [None]:
#Hyperparameter tuning

In [83]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Search space for the random forest step of the pipeline.
# scipy's randint excludes the upper bound, so max_depth is drawn from 1-10
# and min_samples_leaf from 1-15.
param_distribs = [
    {
        'model__max_depth': randint(low=1, high=11),
        'model__min_samples_leaf': randint(low=1, high=16),
    }
]

# 5 sampled candidates, each scored by 3-fold cross-validated (negative) RMSE
random_search = RandomizedSearchCV(
    rfr_pipeline,
    param_distribs,
    n_iter=5,
    cv=3,
    scoring='neg_root_mean_squared_error',
    random_state=42,
)
random_search.fit(X_train, y_train)

# Best candidates first, keeping only the sampled parameters and their score
report_columns = ['param_model__max_depth', 'param_model__min_samples_leaf', 'mean_test_score']
cv_results = pd.DataFrame(random_search.cv_results_)
random_cv_res = cv_results.sort_values(by='mean_test_score', ascending=False).head()[report_columns]

In [84]:
# Display the randomized-search candidates, best mean_test_score first
random_cv_res

Unnamed: 0,param_model__max_depth,param_model__min_samples_leaf,mean_test_score
1,8,13,-415.63046
4,7,11,-415.775061
2,5,7,-417.161266
0,7,4,-419.590191
3,10,3,-425.656191
