## Price Prediction for New Airbnb Hosts in the Bay Area

This notebook covers exploratory data analysis (EDA), feature engineering and preprocessing, and modeling of listing-price decisions for a new Airbnb host in the Bay Area.


In [None]:
import pandas as pd
import numpy as np
from geopy.distance import great_circle
import matplotlib.pyplot as plt
# The 'seaborn' style sheet was renamed 'seaborn-v0_8' in Matplotlib 3.6 and the old
# name was later removed; use whichever name this Matplotlib install provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
import seaborn as sns
import re
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
import statsmodels.api as sm
from collections import Counter
import pickle
import shap
shap.initjs()
import warnings
warnings.filterwarnings("ignore")


# Display every row and never truncate cell text when rendering DataFrames.
pd.set_option('display.max_rows', None)
# None means "no limit"; the old -1 sentinel was deprecated in pandas 1.0 and is
# rejected by newer releases.
pd.set_option('display.max_colwidth', None)

#### 1.1 Read in the data and explore the data

In [None]:
# Load the workbook; the first spreadsheet column becomes the DataFrame index.
df_listings = pd.read_excel('dr.xlsx', index_col=0)
print(f"The dataset has {df_listings.shape[0]} rows and {df_listings.shape[1]} columns.")
# duplicated() compares column values only, so this counts fully repeated rows.
print(f"The dataset contains {df_listings.duplicated().sum()} duplicates.")

In [None]:
# Show every row whose index label occurs more than once (all occurrences, not
# only the repeats).
ids = df_listings.index
df_listings[ids.duplicated(keep=False)]

In [None]:
# Transposed preview of the first five rows, so each column is shown as a row.
df_listings.head(5).T

In [None]:
# Array of all column names present in the loaded data.
df_listings.columns.values

#### 1.2 Select potential features based on availability for a new Airbnb host and on intuition about what people look for when searching for an Airbnb.

In [None]:
# Columns retained for the rest of the analysis.
columns_to_keep = [
    'Weight',
    'dim group1',
    'UPS Ground DR',
    'Freight DR',
]

In [None]:
# Take an explicit copy of the selected columns: later cells modify df_raw in place
# (dropna / fillna / drop with inplace=True), which should never act on a slice of
# df_listings.
df_raw = df_listings[columns_to_keep].copy()
print("The dataset has {} rows and {} columns - after dropping irrelevant columns.".format(*df_raw.shape))

In [None]:
# Preview the first rows of the reduced frame.
df_raw.head()

In [None]:
# Count missing values per column before any rows are dropped.
df_raw.isna().sum()

In [None]:
# Drop every row that has at least one missing value. Rebinding the name (rather than
# inplace=True) keeps the cell safe even if df_raw is a slice of another frame.
df_raw = df_raw.dropna()

In [None]:
# Re-check missing values per column after the rows with NaNs were removed.
df_raw.isna().sum()

In [None]:
# Horizontal box plot of Weight with the x-axis limited to 0-200; outliers are red crosses.
df_raw['Weight'].plot(
    kind='box',
    xlim=(0, 200),
    vert=False,
    flierprops={'markerfacecolor': 'r', 'markeredgecolor': 'r', 'marker': 'x'},
    figsize=(18, 2),
);

In [None]:
# Horizontal box plot of dim group1 with the x-axis limited to 0-200; outliers are red crosses.
df_raw['dim group1'].plot(
    kind='box',
    xlim=(0, 200),
    vert=False,
    flierprops={'markerfacecolor': 'r', 'markeredgecolor': 'r', 'marker': 'x'},
    figsize=(18, 2),
);

In [None]:
# Horizontal box plot of UPS Ground DR with the x-axis limited to 0-1; outliers are red crosses.
df_raw['UPS Ground DR'].plot(
    kind='box',
    xlim=(0, 1),
    vert=False,
    flierprops={'markerfacecolor': 'r', 'markeredgecolor': 'r', 'marker': 'x'},
    figsize=(18, 2),
);

In [None]:
# Horizontal box plot of Freight DR with the x-axis limited to 0-0.1; outliers are red crosses.
df_raw['Freight DR'].plot(
    kind='box',
    xlim=(0, 0.1),
    vert=False,
    flierprops={'markerfacecolor': 'r', 'markeredgecolor': 'r', 'marker': 'x'},
    figsize=(18, 2),
);

In [None]:
# Keep only the rows whose Weight is strictly greater than 50.
heavier_than_50 = df_raw['Weight'] > 50
df1 = df_raw.loc[heavier_than_50]

In [None]:
# Scatter-style plot (circle markers only) of UPS Ground DR against Weight for the
# Weight > 50 subset.
df1.plot(x='Weight', y='UPS Ground DR', style='o')

#### 1.3 Clean Pricing data

In [None]:
# Preview the four pricing-related columns.
# NOTE(review): df_raw is built from columns_to_keep = ['Weight', 'dim group1',
# 'UPS Ground DR', 'Freight DR'], which contains none of these columns, so this lookup
# (and the later cells that use listing columns) would raise KeyError as the notebook
# stands - confirm the intended input data / column selection.
df_raw[['price','security_deposit', 'cleaning_fee', 'extra_people']].head(5)

In [None]:
# Missing-value count for each pricing-related column.
df_raw[['price','security_deposit', 'cleaning_fee', 'extra_people']].isna().sum()

It makes sense to assume that missing values for 'security_deposit' and 'cleaning_fee' mean '$0', i.e. the listing does not charge a security deposit or a cleaning fee.

In [None]:
# Fill missing deposits and cleaning fees with the string '$0.00' (modifies df_raw in
# place), so the '$' / ',' stripping and float conversion in a later cell apply to
# these values as well.
df_raw.fillna({'security_deposit':'$0.00','cleaning_fee':'$0.00' }, inplace=True)

In [None]:
# Re-check missing values in the pricing columns after the fill.
df_raw[['price','security_deposit', 'cleaning_fee', 'extra_people']].isna().sum()

In [None]:
# Convert the currency strings (e.g. '$1,250.00') to floats.
# regex=False makes '$' a literal character on every pandas version; as a regular
# expression '$' is the end-of-string anchor, and the default of `regex` differs
# between pandas releases.
for money_col in ['price', 'security_deposit', 'cleaning_fee', 'extra_people']:
    df_raw[money_col] = (
        df_raw[money_col]
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

In [None]:
# Horizontal box plot of price with the x-axis limited to 0-1000; outliers are red crosses.
df_raw['price'].plot(
    kind='box',
    xlim=(0, 1000),
    vert=False,
    flierprops={'markerfacecolor': 'r', 'markeredgecolor': 'r', 'marker': 'x'},
    figsize=(18, 2),
);

In [None]:
# Remove rows priced at exactly 0 or above 400 (in place, by index label).
price_out_of_range = (df_raw.price > 400) | (df_raw.price == 0)
df_raw.drop(df_raw[price_out_of_range].index, axis=0, inplace=True)
print(f"The dataset has {df_raw.shape[0]} rows and {df_raw.shape[1]} columns - after being price-wise preprocessed.")

In [None]:
# Density-normalised histogram of the UPS Ground DR column using 100 bins.
plt.hist(df_raw['UPS Ground DR'], bins=100, density=True,
         histtype='bar', color='blue')
# The title names the column that is actually plotted.
plt.title('Histogram of UPS Ground DR')
plt.show()

#### 1.4 Explore the relationship between price and other key features

In [None]:
# Share of rows per room_type (fractions that sum to 1).
df_raw.room_type.value_counts(normalize=True)

In [None]:
# Box plot of price grouped by room_type; outliers are drawn as yellow circles.
df_raw.boxplot(column='price', by='room_type', 
           flierprops=dict(markerfacecolor='y', markeredgecolor='y', marker='o'), vert=True, figsize=(10,8))
plt.xlabel('Room_type', fontsize=12)
plt.ylabel('Price', fontsize=12)
plt.title('Price vs Room_type', fontsize=14, fontweight='bold')
# Set the figure-level super-title to an empty string so only the title above is shown.
plt.suptitle('');

#### Significant price differences can be seen across the different room types.

In [None]:
# Share of rows per property_type (fractions that sum to 1).
df_raw.property_type.value_counts(normalize=True)

In [None]:
# Scatter plot of the rows by longitude / latitude, with each point coloured by price
# (gist_heat_r colormap, colorbar shown).
df_raw.plot(kind="scatter", x="longitude", y="latitude", alpha=0.5, figsize=(10,8), 
        c="price", cmap="gist_heat_r", colorbar=True, sharex=False);

#### Create a new distance feature: the distance from each location to the city center.

In [None]:
def distance_to_mid(lat, lon, city_centre=(37.45, -122.15)):
    """Return the great-circle distance, in miles, from a reference point to (lat, lon).

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the accommodation.
    city_centre : tuple of float, optional
        (latitude, longitude) of the reference point. Defaults to
        (37.45, -122.15), the point this notebook uses as the city centre.

    Returns
    -------
    float
        The ``.mi`` value of geopy's ``great_circle`` between the two points.
    """
    accommodation = (lat, lon)
    return great_circle(city_centre, accommodation).mi

In [None]:
# Add a 'distance' column: distance_to_mid evaluated row by row on latitude / longitude.
df_raw['distance'] = df_raw.apply(
    lambda listing: distance_to_mid(listing.latitude, listing.longitude),
    axis=1,
)

In [None]:
# Violin plot of price per neighbourhood_cleansed value.
# sns.set changes the default figure size for subsequent figures as well.
sns.set(rc={'figure.figsize':(20,10)})
viz = sns.violinplot(data=df_raw, x='neighbourhood_cleansed', y='price')
# Rotate the category labels so they do not overlap.
viz.set_xticklabels(viz.get_xticklabels(), rotation=90)
viz.set_title('Density and distribution of prices for each neighberhood')

In [None]:
# Box plot of price grouped by cancellation_policy; outliers are drawn as green dots.
df_raw.boxplot(column='price', by='cancellation_policy', 
           flierprops=dict(markerfacecolor='g', markeredgecolor='g', marker='.'), vert=True, figsize=(10,8))
plt.xlabel('Policy', fontsize=12)
plt.ylabel('Price', fontsize=12)
plt.title('Prices by cancellation_policy', fontsize=14, fontweight='bold')
# Set the figure-level super-title to an empty string so only the title above is shown.
plt.suptitle('');

There is probably no significant price difference among the different cancellation policies.

In [None]:
# Missing values per column, used to decide which columns / rows to drop next.
df_raw.isna().sum()

### 1.5 Drop columns with too many missing values

In [None]:
# Drop three columns outright, then drop the rows lacking bathrooms or bedrooms
# (both operations modify df_raw in place).
df_raw.drop(
    columns=['space', 'square_feet', 'review_scores_rating'],
    inplace=True,
)
df_raw.dropna(subset=['bathrooms', 'bedrooms'], inplace=True)
print(f"The dataset has {df_raw.shape[0]} rows and {df_raw.shape[1]} columns.")

In [None]:
# Confirm the remaining missing-value counts after the drops.
df_raw.isna().sum()

In [None]:
# Transposed preview of the first two rows, so each column is shown as a row.
df_raw.head(2).T

#### 1.6 Feature engineering on 'amenities'

In [None]:
# Tally how often each amenity string occurs across all rows.
results = Counter()
amenity_lists = (
    df_raw['amenities']
    .str.strip('{}')        # remove the surrounding braces
    .str.replace('"', '')   # remove every double quote
    .str.split(',')
)
for amenity_list in amenity_lists:
    results.update(amenity_list)

results.most_common(50)

In [None]:
# Boolean flag columns: True where the amenities text matches the given pattern.
# Patterns are passed to str.contains, so '|' acts as alternation.
amenity_patterns = {
    'Laptop_friendly_workspace': 'Laptop friendly workspace',
    'TV': 'TV',
    'Free_parking': 'Free parking on premises|Free street parking',
    'Family_kid_friendly': 'Family/kid friendly',
    'Pool': 'Pool',
    'Pets_allowed': 'Pets allowed',
    'Smoking_allowed': 'Smoking allowed',
}
for flag_col, pattern in amenity_patterns.items():
    df_raw[flag_col] = df_raw['amenities'].str.contains(pattern)

In [None]:
# Column dtypes and non-null counts after adding the amenity flags.
df_raw.info()

#### 1.7 Convert the type of categorical features

In [None]:
# Cast the listed columns to pandas' 'category' dtype.
categorical_cols = [
    'host_has_profile_pic',
    'neighbourhood_cleansed',
    'room_type',
    'host_is_superhost',
    'property_type',
    'bed_type',
    'instant_bookable',
    'is_business_travel_ready',
    'cancellation_policy',
]
for col in categorical_cols:
    df_raw[col] = df_raw[col].astype('category')

In [None]:
# Verify the dtypes after the categorical conversion.
df_raw.info()

### 2.1 Prepare data for model development

In [None]:
# Separate the prediction target (price, kept as a one-column DataFrame) from the
# remaining columns.
target = df_raw[["price"]]
features = df_raw.drop(columns=["price"])

In [None]:
# Build the model matrix: float64 / int64 / bool columns pass through unchanged and
# category columns are one-hot encoded. Columns of any other dtype are left out.
num_feats = features.select_dtypes(include=['float64', 'int64', 'bool']).copy()
cat_feats = pd.get_dummies(features.select_dtypes(include=['category']).copy())
features = pd.concat([num_feats, cat_feats], axis=1)
print(features.shape)
features.head(5)

#### 2.2 Split the train and test data set

In [None]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split, and
# therefore every metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [None]:
# Standardise the features: the scaler is fitted on the training split only and the
# same fitted transformation is then applied to the test split.
sc = StandardScaler()
# X_train / X_test are rebound to the scaled output, so re-running this cell alone
# would scale already-scaled data.
X_train = sc.fit_transform(X_train)
X_test  = sc.transform(X_test)

#### 2.3 Train baseline models: a linear regression with L1 regularization (Lasso) and an XGBoost model

In [None]:
# Baseline model: Lasso regression with alpha = 0.1, then predictions on both splits.
clr = linear_model.Lasso(alpha=0.1)
clr.fit(X_train, y_train)
y_train_pred0, y_test_pred0 = clr.predict(X_train), clr.predict(X_test)

In [None]:
# Test-set RMSE and R^2 of the Lasso baseline, printed rounded to 4 decimals.
RMSE0 = np.sqrt(mean_squared_error(y_test, y_test_pred0))
print(f"RMSE: {round(RMSE0, 4)}")
r20 = r2_score(y_test, y_test_pred0)
print(f"r2: {round(r20, 4)}")

In [None]:
# create a baseline xgboost model
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)
y_pred_train = xgb_model.predict(X_train)
y_pred_test = xgb_model.predict(X_test)

In [None]:
# Test-set RMSE and R^2 of the baseline XGBoost model, printed rounded to 4 decimals.
RMSE = np.sqrt(mean_squared_error(y_test, y_pred_test))
print(f"RMSE: {round(RMSE, 4)}")
r2 = r2_score(y_test, y_pred_test)
print(f"r2: {round(r2, 4)}")

The XGBoost model performs much better than the linear regression model.

#### 2.4 Grid search of xgboost model with cross-validation

In [None]:
# Exhaustive 5-fold grid search over 3 * 3 * 4 * 4 * 3 = 432 hyperparameter
# combinations, run on all available cores.
param_grid = dict(
    n_estimators=[100, 150, 200],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7, 9],
    colsample_bytree=[0.4, 0.6, 0.7, 1],
    gamma=[0.0, 0.1, 0.2],
)

xgb_model_grid_search = GridSearchCV(xgb_model, param_grid, cv=5, n_jobs=-1)
xgb_model_grid_search.fit(X_train, y_train)
print(xgb_model_grid_search.best_params_)

In [None]:
# Refit XGBoost with a fixed hyperparameter set. The values are written out here
# rather than read from the grid-search object.
xgb_model = xgb.XGBRegressor(
    colsample_bytree=0.4,
    gamma=0.1,
    learning_rate=0.05,
    max_depth=9,
    n_estimators=200,
    random_state=3,
)
xgb_model.fit(X_train, y_train)
y_pred_train, y_pred_test = xgb_model.predict(X_train), xgb_model.predict(X_test)

In [None]:
# Persist the fitted model. The context manager guarantees the file handle is closed
# (and the data flushed to disk) even if pickling raises.
with open('xgboost_model.pickle', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)

In [None]:
# Test-set RMSE and R^2 of the refitted XGBoost model, printed rounded to 4 decimals.
# This overwrites the RMSE / r2 values computed for the baseline model.
RMSE = np.sqrt(mean_squared_error(y_test, y_pred_test))
print(f"RMSE: {round(RMSE, 4)}")
r2 = r2_score(y_test, y_pred_test)
print(f"r2: {round(r2, 4)}")

#### Generate CV results using the selected hyperparameters

In [None]:
# Wrap the training features and labels in a DMatrix for use with xgb.cv.
xg_train = xgb.DMatrix(data=X_train, label=y_train)

In [None]:
# 5-fold cross-validation of the chosen hyperparameters: up to 200 boosting rounds,
# RMSE metric, early stopping after 10 rounds, results returned as a DataFrame.
params = dict(colsample_bytree=0.4, gamma=0.1, learning_rate=0.05, max_depth=9)

cv_results = xgb.cv(
    params=params,
    dtrain=xg_train,
    num_boost_round=200,
    nfold=5,
    metrics="rmse",
    early_stopping_rounds=10,
    as_pandas=True,
)

In [None]:
# Cross-validation results for the first boosting rounds.
cv_results.head()

In [None]:
# Cross-validation results for the last boosting rounds kept.
cv_results.tail()

In [None]:
# plot the important features
feat_importances = pd.Series(booster.feature_importances_, index=features.columns)
feat_importances.nlargest(20).sort_values().plot(kind='barh', color='darkgrey', figsize=(10,5))
plt.xlabel('Relative Feature Importance with XGBoost');

#### 3.1 Create SHAP explainer

In [None]:
# Load SHAP's JavaScript support into the notebook (also called once in the import cell).
shap.initjs()

In [None]:
# Build a tree explainer for the fitted XGBoost regressor (`xgb_model`, fitted in the
# cells above) and compute SHAP values for every training row.
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_train)

In [None]:
# Force plot over all training rows; column names are supplied separately via
# feature_names.
shap.force_plot(explainer.expected_value, shap_values, X_train,feature_names = features.columns)

In [None]:

# SHAP summary plot (default plot type) of the training-set SHAP values.
shap.summary_plot(shap_values, X_train,feature_names = features.columns)

In [None]:
# Bar-type SHAP summary plot of the training-set SHAP values.
shap.summary_plot(shap_values, X_train, plot_type="bar",feature_names = features.columns)

In [None]:
# Integers 46, 47, ..., 499.
x = [*range(46, 500)]

In [None]:
# Grid 0.46, 0.47, ..., 4.99. It is derived directly from the integer range rather
# than from the current value of `x`, so re-running this cell cannot divide by 100 twice.
x = [step / 100 for step in range(46, 500)]

In [None]:
def relate(x):
    """Evaluate ``0.000443*x**2 + 0.0487*100 + 0.073331 - 0.016356*100*x**(1/3)``.

    NOTE(review): the ``0.0487*100`` term does not depend on ``x`` - confirm that a
    constant (rather than a term linear in ``x``) is intended.
    """
    quadratic_term = 0.000443 * x ** 2
    constant_term = 0.0487 * 100
    cube_root_term = 0.016356 * 100 * x ** (1 / 3)
    return quadratic_term + constant_term + 0.073331 - cube_root_term

In [None]:
# Evaluate relate at every grid point in x.
y = [relate(point) for point in x]

In [None]:
# Display the full list of computed values.
y