# Machine Learning Applications for Airbnb Data

### Group 3 - Dhruv Shah, Jenn Hong, Setu Shah, Sonya Dreyer

---



• State the problem

• Tell us who cares about this problem and Why

• Describe your data – where it came from, what it contains

• Present some interesting descriptive analyses (plots/tables) that motivate your exercise

• Present your main results

• Which methods worked best for your problem?

• What were the challenges you faced? Tell us about the biggest challenge you faced and how you
overcame it (or, tried but did not – that’s fine too – not every problem has a solution.)

• Conclude – what have you learnt that can be put to practice?

# Data Cleaning

---



In [1]:
# Import preprocessing libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Download the file

!wget 'https://maven-datasets.s3.amazonaws.com/Airbnb/Airbnb+Data.zip'

--2023-12-03 01:20:04--  https://maven-datasets.s3.amazonaws.com/Airbnb/Airbnb+Data.zip
Resolving maven-datasets.s3.amazonaws.com (maven-datasets.s3.amazonaws.com)... 52.217.164.193, 16.182.33.89, 54.231.230.169, ...
Connecting to maven-datasets.s3.amazonaws.com (maven-datasets.s3.amazonaws.com)|52.217.164.193|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 91005234 (87M) [application/zip]
Saving to: ‘Airbnb+Data.zip’


2023-12-03 01:20:06 (40.5 MB/s) - ‘Airbnb+Data.zip’ saved [91005234/91005234]



In [3]:
# Unzip the file

!unzip Airbnb+Data.zip

Archive:  Airbnb+Data.zip
   creating: Airbnb Data/
  inflating: Airbnb Data/Listings.csv  
  inflating: Airbnb Data/Listings_data_dictionary.csv  
  inflating: Airbnb Data/Reviews.csv  
  inflating: Airbnb Data/Reviews_data_dictionary.csv  


In [4]:
# Load the data frames
#
# The archive is downloaded and extracted into the current working directory,
# so read it through a relative path rather than the Colab-only '/content/...'
# absolute path (on Colab both resolve to the same file).

listings = pd.read_csv('Airbnb Data/Listings.csv', encoding='latin1', low_memory=False)

#reviews = pd.read_csv('Airbnb Data/Reviews.csv', encoding='latin1', low_memory=False)

In [5]:
# Print column names, non-null counts and dtypes to see which columns have missing values
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279712 entries, 0 to 279711
Data columns (total 33 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   listing_id                   279712 non-null  int64  
 1   name                         279539 non-null  object 
 2   host_id                      279712 non-null  int64  
 3   host_since                   279547 non-null  object 
 4   host_location                278872 non-null  object 
 5   host_response_time           150930 non-null  object 
 6   host_response_rate           150930 non-null  float64
 7   host_acceptance_rate         166625 non-null  float64
 8   host_is_superhost            279547 non-null  object 
 9   host_total_listings_count    279547 non-null  float64
 10  host_has_profile_pic         279547 non-null  object 
 11  host_identity_verified       279547 non-null  object 
 12  neighbourhood                279712 non-null  object 
 13 

In [6]:
# Parse the host sign-up dates into pandas datetimes

listings['host_since'] = pd.to_datetime(listings['host_since'])

In [7]:
# Put the overall rating on an out-of-10 scale by dividing it by ten.
# Note: this cell is not idempotent -- running it twice divides twice.

listings['review_scores_rating'] = listings['review_scores_rating'].div(10)

In [8]:
# Converting prices to USD
#
# NOTE: the rates are paired with cities purely by position. `unique()` returns
# cities in order of first appearance in the data, so the list below must follow
# that same order. Re-check the pairing whenever the data file or the row order
# changes (inspect `currency_map` after running this cell).

cities = listings['city'].unique()
exchange_rates = [1.0808, 1, 0.028388, 0.20328, 0.65462, 0.039480, 1.0808, 0.12777, 0.0493, 0.053215] # update these numbers before fitting models

# zip() silently truncates to the shorter input, which would leave some cities
# without a rate (or some rates unused), so fail loudly on a length mismatch.
if len(cities) != len(exchange_rates):
    raise ValueError(
        f"Found {len(cities)} distinct cities but {len(exchange_rates)} exchange rates; "
        "update exchange_rates so there is exactly one rate per city."
    )
currency_map = dict(zip(cities, exchange_rates))

# Vectorised conversion: look up each row's rate by city and multiply column-wise
# instead of calling a Python lambda once per row.
listings['usd_price'] = listings['price'] * listings['city'].map(currency_map) # create new column
listings = listings.drop(columns='price') # drop original column

In [9]:
# Encode the 't'/'f' flag columns as 1/0 integers

# Caveat: every value other than 't' -- including missing values -- becomes 0

for flag_column in ('host_is_superhost', 'host_has_profile_pic',
                    'host_identity_verified', 'instant_bookable'):
    listings[flag_column] = listings[flag_column].eq('t').astype('int64')

In [10]:
# We can or should drop listing_id, host_id, property_type, neighbourhood

# We can drop district as it only has values for New York; the rest are all NULL

# We should drop name and possibly host_location (unless we want to/can figure out how to extract precise location --> latitude and longitude can be used to create clusters like in the lab)

# All host locations within each country have been mapped to the most prominent city in that country

# We need to possibly impute values (or drop columns) for host response time/rate, host_acceptance_rate, and some of the ratings columns [Iterative Imputer]

In [11]:
# Dropping hopeless columns: identifiers, free-text fields and location fields
# that are not part of the feature lists used for modelling.
# ('property_type' used to be listed twice; each column now appears once.)

columns_to_drop = ['listing_id', 'host_id', 'property_type', 'neighbourhood', 'district',
                   'name', 'host_location', 'amenities', 'longitude', 'latitude']

# `columns=` already selects the axis, so no separate `axis` argument is needed
listings = listings.drop(columns=columns_to_drop)

In [12]:
# Drop the host responsiveness columns, which are heavily incomplete
# (roughly 40-46% of their values are missing according to listings.info())

missing_values_columns = [
    'host_response_time',
    'host_response_rate',
    'host_acceptance_rate',
]

listings = listings.drop(columns=missing_values_columns)

# Preprocessing

---



In [13]:
# Hold out 20% of the listings as a test set so generalization error can be estimated

from sklearn.model_selection import train_test_split

# Target is the USD price; every remaining column is a candidate feature
y = listings["usd_price"]
X = listings.drop(columns="usd_price")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((223769, 19), (55943, 19), (223769,), (55943,))

In [14]:
# # Iteratively impute missing values for numerical columns

# X_train_num = X_train.select_dtypes(include=[np.number])

# # explicitly require this experimental feature
# from sklearn.experimental import enable_iterative_imputer  # noqa

# # now you can import normally from sklearn.impute
# from sklearn.impute import IterativeImputer

# iter_imputer = IterativeImputer(random_state=42)
# X_train_imp = iter_imputer.fit_transform(X_train_num)
# X_train_imp_df = pd.DataFrame(X_train_imp, columns=X_train_num.columns, index=X_train_num.index)

In [15]:
# Preprocessing pipeline: one-hot encode the categorical features, and
# impute + standardise the numerical features.

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
# The experimental flag must be imported before IterativeImputer itself
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


from sklearn import set_config
set_config(display='diagram')

# Categorical features (host_since is not included; it could be added later, e.g. split by month)
cat_attribs = [
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'city',
    'room_type',
    'instant_bookable',
]

# Numerical features (latitude and longitude are excluded)
num_attribs = [
    'host_total_listings_count',
    'accommodates',
    'bedrooms',
    'review_scores_rating',
    'review_scores_accuracy',
    'minimum_nights',
    'maximum_nights',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
]

# Numerical branch: fill gaps with the iterative imputer, then standardise
num_pipeline = make_pipeline(
    IterativeImputer(random_state=42),
    StandardScaler(),
)

# Amenities are left out for now

preprocess_pipeline = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(drop="first"), cat_attribs),
        ("num", num_pipeline, num_attribs),
    ]
)

preprocess_pipeline

In [16]:
# Fit the preprocessing on the training split only, then apply the fitted
# transformer to both splits
tr_X = preprocess_pipeline.fit_transform(X_train)
t_X = preprocess_pipeline.transform(X_test)

# Targets pass through unchanged
tr_y, t_y = y_train, y_test

# check sizes
X_train.shape, tr_X.shape, tr_y.shape, t_X.shape, t_y.shape

((223769, 19), (223769, 28), (223769,), (55943, 28), (55943,))

In [17]:
from sklearn.ensemble import HistGradientBoostingRegressor

# Creating a HistGradientBoostingRegressor with specified hyperparameters.
# (Despite the `_clf` suffix this is a regressor; the name is kept as-is.)
# random_state is pinned so the fit -- including any internal random validation
# split used for early stopping -- gives the same score on every run.
hgb_clf = HistGradientBoostingRegressor(max_leaf_nodes=24,
                                        max_iter=80,
                                        learning_rate=0.07133747470888435,  # shrinkage applied to each tree
                                        random_state=42)

# Fitting the regressor model on the training data
hgb_clf.fit(tr_X, tr_y)

# Printing the R2 score to evaluate the performance of the gradient boosting model on the test data
print(f'Gradient boosting with {hgb_clf.max_iter} trees leads to R2 score of {hgb_clf.score(t_X, t_y):.4f}.')

Gradient boosting with 80 trees leads to R2 score of 0.1407.


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, loguniform
from sklearn.ensemble import HistGradientBoostingRegressor

# Define the parameter distributions sampled during hyperparameter tuning
param_grid = {'max_leaf_nodes': randint(10, 25),
              'max_iter': randint(50, 100),
              'learning_rate': loguniform(1e-3, 1e-1)}

# Create the HistGradientBoostingRegressor
hgb_clf = HistGradientBoostingRegressor(random_state=42)

# Set up RandomizedSearchCV: 30 sampled configurations, 5-fold CV, scored by R2
rand_search = RandomizedSearchCV(hgb_clf, param_distributions=param_grid,
                                 cv=5, n_iter=30, scoring='r2', random_state=42)

# Fit the model with the training data
rand_search.fit(tr_X, tr_y)

# Print the best hyperparameters
print("Best Hyperparameters:", rand_search.best_params_)

# Evaluate the model with the best hyperparameters on the test set.
# The result is stored as `test_r2` rather than `r2_score` so it does not shadow
# sklearn.metrics.r2_score, which other cells call as a function.
best_model = rand_search.best_estimator_
test_r2 = best_model.score(t_X, t_y)
print(f'R2 score on the test set: {test_r2:.4f}')

In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import loguniform, randint
# from sklearn.metrics import make_scorer
# from sklearn.metrics import mean_squared_error



# param_grid = {'max_leaf_nodes': randint(2, 16),
#               'max_iter': randint(2, 32),
#               'learning_rate': loguniform(1e-2, 1)}

# scorer = make_scorer(mean_squared_error, greater_is_better=False)
# rand_search = RandomizedSearchCV(HistGradientBoostingRegressor(random_state=42),
#                                  param_grid, cv=5, n_iter=30, scoring=scorer,
#                                  random_state=42)

# rand_search.fit(tr_X, tr_y)
# rand_cv_res = pd.DataFrame(rand_search.cv_results_)
# rand_cv_res.sort_values(by="mean_test_score", ascending=True, inplace=True)
# rand_cv_res.filter(regex='(^param_|mean_test_score)', axis=1).head()

In [None]:
from xgboost import XGBRegressor
# We'll need to use a portion of the training data as 'validation' data to determine how many trees to grow
tr_X_tr, tr_X_v, tr_y_tr, tr_y_v = train_test_split(tr_X, tr_y, test_size = .2, random_state=0)

bst = XGBRegressor(n_estimators=300, # up to 300 boosting rounds (early stopping may use fewer)
                    max_leaves=16, # each will have at most 16 leaves
                    learning_rate=0.05,
                    eval_metric='rmse',
                    num_parallel_tree = 2,
                    colsample_bytree=0.6,
                    subsample=0.6,
                    early_stopping_rounds=10) # stop once validation RMSE has not improved for 10 rounds
bst.fit(tr_X_tr, tr_y_tr, eval_set=[(tr_X_v, tr_y_v)], verbose=False)

# A regressor's score() returns R2, not RMSE, so the RMSE that the message
# reports is computed explicitly from the test-set predictions.
xgb_test_rmse = np.sqrt(np.mean((np.asarray(t_y) - bst.predict(t_X)) ** 2))
print(f'Root Mean Squared Error of XGBoost is {xgb_test_rmse:.4f}.')
print(f'R2 score of XGBoost is {bst.score(t_X, t_y):.4f}.')

In [None]:
# # Checking data after pre-processing
# print(X_train.shape)
# X_train_prepared = preprocess_pipeline.fit_transform(X_train)
# print(X_train_prepared.shape)

In [None]:
# # Checking new column names

# preprocess_pipeline.get_feature_names_out()

In [None]:
# X_train_prepared_df = pd.DataFrame(X_train_prepared, # the numpy array containing the processed data
#                                    columns=preprocess_pipeline.get_feature_names_out(), # column names
#                                    index=X_train.index # row numbers/labels
#                                    )
# X_train_prepared_df.isna().sum()

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Linear regression baseline: preprocessing and model chained in one pipeline,
# fitted and evaluated on the training split.
lin_reg = make_pipeline(preprocess_pipeline, LinearRegression())
lin_reg.fit(X_train, y_train)
y_train_predictions = lin_reg.predict(X_train)

# RMSE as sqrt(MSE): the `squared=False` flag of mean_squared_error is
# deprecated/removed in newer scikit-learn releases, while this form works on all of them.
lin_rmse = np.sqrt(mean_squared_error(y_train, y_train_predictions))
print(f"The training data RMSE is {lin_rmse:.0f} or about {(lin_rmse/y_train.mean()*100):.0f}% error")

In [None]:
from sklearn.metrics import r2_score

# R2 on the training split for whatever model produced y_train_predictions
# (the linear regression, when the notebook is run top to bottom)
lin_train_r2 = r2_score(y_train, y_train_predictions)
print(f'R-squared score from Linear Regression model is {lin_train_r2:.3f}')

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

# Decision tree baseline: same preprocessing, fitted and evaluated on the training split
tree_reg = make_pipeline(preprocess_pipeline, DecisionTreeRegressor(random_state=42))
tree_reg.fit(X_train, y_train)
y_train_predictions = tree_reg.predict(X_train)

# RMSE as sqrt(MSE): the `squared=False` flag of mean_squared_error is
# deprecated/removed in newer scikit-learn releases, while this form works on all of them.
tree_rmse = np.sqrt(mean_squared_error(y_train, y_train_predictions))
print(f'Training data error for the tree {tree_rmse:.0f}')

In [None]:
# Import here so the cell does not depend on an earlier cell having bound
# `r2_score` to the metric function (the name may have been rebound elsewhere).
from sklearn.metrics import r2_score

# R2 on the training split for whatever model produced y_train_predictions
# (the decision tree, when the notebook is run top to bottom)
print(f'R-squared score from Decision Tree model is {r2_score(y_train, y_train_predictions):.3f}')

In [None]:
from sklearn.model_selection import cross_val_score

# 3-fold cross-validated RMSE for the linear regression pipeline.
# The scorer reports negated RMSE, so the sign is flipped back afterwards.
lin_cv_neg_rmses = cross_val_score(
    lin_reg,
    X_train,
    y_train,
    cv=3,
    scoring="neg_root_mean_squared_error",
)
lin_cv_rmses = -lin_cv_neg_rmses
print(f"Average Linear Regression Cross-Validation RMSE: {lin_cv_rmses.mean():.0f}")

In [None]:
# 3-fold cross-validated RMSE for the decision tree pipeline.
# The scorer reports negated RMSE, so the sign is flipped back afterwards.

tree_cv_neg_rmses = cross_val_score(
    tree_reg,
    X_train,
    y_train,
    cv=3,
    scoring="neg_root_mean_squared_error",
)
tree_cv_rmses = -tree_cv_neg_rmses
print(f"Average Decision Tree Regression Cross-Validation RMSE: {tree_cv_rmses.mean():.0f}")