# AirBnB Seattle Data Deep-dive

**Goal of this notebook:** provide data-driven answers to the following questions:
 - Which neighborhoods are family friendly?
 - What are the top reasons for getting high review scores?
 - Can review scores be predicted?

In [24]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import xgboost
from sklearn.model_selection import train_test_split #split
from sklearn.metrics import r2_score, mean_squared_error #metrics

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
kaggle_input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in kaggle_input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [25]:
# Raise pandas' row / column / width display limits to 1000 so wide frames are not truncated.
for option_name in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(option_name, 1000)


In [26]:
# Load the calendar CSV and preview its first five rows.
calendar_df = pd.read_csv('/kaggle/input/seattle/calendar.csv')
calendar_df.head()

In [27]:
# Show the column names available in the calendar frame.
calendar_df.columns

In [28]:
# Load the listings CSV and preview its first three rows.
listings_df = pd.read_csv('/kaggle/input/seattle/listings.csv')
listings_df.iloc[:3]

In [29]:
# Show the column names available in the listings frame.
listings_df.columns

In [30]:
# Inner-join listings to their calendar rows (listings.id == calendar.listing_id).
listings_calendar_merged = pd.merge(listings_df, calendar_df, left_on="id", right_on="listing_id")

# The 'date' strings are dash-separated with the year first and the month second.
# Split the whole column once with vectorised string ops instead of a row-wise
# DataFrame.apply, which is needlessly slow on a frame of this size.
date_parts = listings_calendar_merged['date'].str.split("-", expand=True)
listings_calendar_merged['month'] = date_parts[1].astype(int)
listings_calendar_merged['year'] = date_parts[0].astype(int)
listings_calendar_merged.head(5)

In [31]:
# Build a numeric 'price' column from 'price_x' (the left-hand, i.e. listings, price
# after the merge suffixed the clashing 'price' columns).
# regex=True is passed explicitly: "[$, ]" is a character class, and newer pandas
# versions treat the pattern as a literal string by default, which would leave the
# "$" and "," in place and make the float conversion fail.
listings_calendar_merged['price'] = (
    listings_calendar_merged['price_x']
    .astype(str)
    .str.replace("[$, ]", "", regex=True)
    .astype("float")
)

# The two suffixed source columns are no longer needed once 'price' exists.
# NOTE: this makes the cell non-idempotent; re-run the merge cell before re-running it.
listings_calendar_merged = listings_calendar_merged.drop(columns=['price_x', 'price_y'])
listings_calendar_merged.head(5)

In [32]:
#listings_df = listings_calendar_merged.copy(deep=True)
#listings_df.shape

In [33]:
# Load the reviews CSV and preview its first five rows.
reviews_df = pd.read_csv('/kaggle/input/seattle/reviews.csv')
reviews_df.head()

In [34]:
# Show the column names available in the reviews frame.
reviews_df.columns

# Question 1: Find family friendly neighborhoods 

* **Goal: Based on different features in the Seattle Airbnb dataset, identify a list of potentially family-friendly neighborhoods**

In [35]:
# Compute the missing-value mask once and reuse it for both summaries.
missing_mask = listings_df.isna()

# Per column: does it contain any missing value?
print(missing_mask.any())

# Per column: how many values are missing (bar chart).
plt.figure(figsize=[40, 25])
missing_per_column = missing_mask.sum()
missing_per_column.plot(kind="bar", rot=45)
plt.show()

In [36]:
# Preview the first 20 free-text neighbourhood descriptions.
listings_df['neighborhood_overview'].head(20)

In [37]:

# Replace every missing value with the string 'None' so .str methods can run on text columns.
listings_df_no_nan = listings_df.fillna('None')

# Listings whose neighbourhood description mentions "children" (case-insensitive).
mentions_children = listings_df_no_nan['neighborhood_overview'].str.contains("children", case=False)
family_df = listings_df_no_nan[mentions_children]

# Cell output: shape of the NaN-free listings frame.
listings_df_no_nan.shape

In [38]:
# Number of listings (rows) and columns in the "children" subset.
family_df.shape

In [39]:
import nltk
from nltk.stem.porter import *

In [40]:
# Porter stemmer instance, used to reduce the family-related keywords to their stems.
stemmer = PorterStemmer()

In [41]:
# Keywords taken as signals that a text is about families.
family_synomyms = ['families', 'group', 'household', 'kids', 'children']

# Stem each keyword, then add the two unstemmed forms 'family' and 'families'.
# NOTE(review): the stemmed list does not appear to be used by the later search loop,
# which iterates over `family_synomyms` — confirm whether that is intended.
stemmed_family_synomyms = list(map(stemmer.stem, family_synomyms))
stemmed_family_synomyms.extend(['family', 'families'])
print(stemmed_family_synomyms)

stemmed_family_synomyms_df = pd.DataFrame({"synonyms": stemmed_family_synomyms})
stemmed_family_synomyms_df['synonyms']

In [42]:
# Replace missing values with the string 'None' so the comment text can be searched.
reviews_df_no_nan = reviews_df.fillna('None')

# Listing ids of the reviews whose comment contains 'family' (case-sensitive).
comment_mentions_family = reviews_df_no_nan['comments'].str.contains('family')
reviews_df_no_nan.loc[comment_mentions_family, 'listing_id']

In [43]:
# Collect the listings that look family friendly: a listing qualifies when a family
# keyword appears either in its own neighbourhood overview or in one of its reviews.
print(listings_df_no_nan.shape)
print(reviews_df_no_nan.shape)

result = []
for synonym in family_synomyms:
    # NOTE(review): matching here is case-sensitive, unlike the earlier "children"
    # check that passes case=False — confirm which behaviour is wanted.

    # Listings whose neighbourhood overview mentions the keyword.
    overview_hits = listings_df_no_nan['neighborhood_overview'].str.contains(synonym)
    result.append(listings_df_no_nan[overview_hits])

    # Listings with at least one review mentioning the keyword. Reviews point at
    # their listing through 'listing_id'; the reviews' own 'id' column is a different
    # key, so joining listings.id against it (as before) pairs unrelated rows.
    review_hits = reviews_df_no_nan['comments'].str.contains(synonym)
    reviewed_listing_ids = reviews_df_no_nan.loc[review_hits, 'listing_id']
    result.append(listings_df_no_nan[listings_df_no_nan['id'].isin(reviewed_listing_ids)])

family_friendly_listings_df = pd.concat(result)
print(family_friendly_listings_df.shape)

# Keep one row per listing. keep="first" retains a single copy of each duplicate;
# keep=False would drop every listing that matched more than once, i.e. exactly the
# listings with the strongest family-friendly signal.
family_friendly_listings_df = family_friendly_listings_df.drop_duplicates(subset="id", keep="first")
print(family_friendly_listings_df.shape)

In [44]:
# Write the selected listings to a CSV in the current working directory
# (the DataFrame index is written as well, since index=False is not passed).
family_friendly_listings_df.to_csv('family_friendly_v2.csv')

In [45]:
# Rows and columns of the family-friendly listings frame.
family_friendly_listings_df.shape

In [46]:
# Count family-friendly listings per 'host_neighbourhood' value.
# NOTE(review): this groups by the host's neighbourhood column — confirm that it,
# rather than a listing-level neighbourhood column, is the intended grouping key.
listings_by_neighbourhood = family_friendly_listings_df.groupby('host_neighbourhood')
family_friendly_host_neighbourhood_count = listings_by_neighbourhood['host_neighbourhood'].count()
print(family_friendly_host_neighbourhood_count)

# Bar chart of the per-neighbourhood counts.
plt.figure(figsize=[40, 25])
family_friendly_host_neighbourhood_count.plot(kind="bar", rot=45)
plt.show()

In [47]:
# Plot only the neighbourhoods whose family-friendly listing count is above the mean.
mean_listing_count = family_friendly_host_neighbourhood_count.mean()
above_mean_counts = family_friendly_host_neighbourhood_count[
    family_friendly_host_neighbourhood_count > mean_listing_count
]

# The y-axis label must be passed as `ylabel`; a `y=` keyword on a Series plot is not
# an axis label, so the previous call rendered the chart without one.
above_mean_counts.plot(
    kind='bar',
    title='Family Friendly Areas in Seattle',
    xlabel='Location Name',
    ylabel='Number of AirBnb Rentals',
)

# **Question 2: Top features contributing to review ratings**

* **Identify which listing features contribute most to customer review ratings (measured below by `top_rated` = review score × reviews per month / 100)**

In [48]:
# Fill missing values with 0, then derive the modelling target:
# top_rated = review_scores_rating * reviews_per_month / 100.
listings_df_no_nan_numeric = listings_df.fillna(0).assign(
    top_rated=lambda frame: (frame['review_scores_rating'] * frame['reviews_per_month']) / 100
)
listings_df_no_nan_numeric.head(2)

In [49]:
# Columns that carry no signal for modelling: URL columns and columns holding a
# single value across all rows.
all_columns = listings_df_no_nan_numeric.columns

url_columns = [column_name for column_name in all_columns if 'url' in column_name]

unique_value_columns = []
for column_name in all_columns:
    distinct_values = listings_df_no_nan_numeric[column_name].unique()
    if len(distinct_values) == 1:
        print('a un-used column because same value:', column_name, distinct_values)
        unique_value_columns.append(column_name)

# Drop both groups in one call.
listings_df_trimmed = listings_df_no_nan_numeric.drop(columns=url_columns + unique_value_columns)

In [50]:
# Print the columns that remain after dropping URL and single-value columns.
print(listings_df_trimmed.columns)

In [51]:
# Preview the columns whose names start with 'review'.
# This cell only displays them; the frame itself is not modified here.
listings_df_trimmed.filter(regex='^review',axis=1).head()

In [53]:
# Object-dtype columns are treated as categorical. They are dropped below rather than
# one-hot encoded (an earlier get_dummies approach was left disabled).
cat_columns = listings_df_trimmed.select_dtypes(include=['object']).columns.tolist()
print(f"categorical columns ={cat_columns}")

# Identifier columns plus the review columns the target is derived from or closely tied to.
non_feature_columns = [
    'id',
    'host_id',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'reviews_per_month',
]
listings_df_trimmed = listings_df_trimmed.drop(columns=non_feature_columns)
listings_df_trimmed = listings_df_trimmed.drop(columns=cat_columns)

In [54]:
# Preview the remaining feature columns plus the 'top_rated' target.
listings_df_trimmed.head(3)

In [55]:
# Hold out a quarter of the rows for testing; the fixed seed keeps the split reproducible.
TEST_SIZE = 0.25
RAND_STATE = 42

# Features are every remaining column except the target; the target stays a one-column frame.
X = listings_df_trimmed.drop(columns=['top_rated'])
y = listings_df_trimmed[['top_rated']]

split_parts = train_test_split(X, y, test_size=TEST_SIZE, random_state=RAND_STATE)
X_train, X_test, y_train, y_test = split_parts

In [56]:
# Fit an XGBoost regressor on the training split.
xgb_params = dict(
    n_estimators=200,
    learning_rate=0.05,
    gamma=0,
    subsample=0.75,
    colsample_bytree=1,
    max_depth=7,
)
xgb = xgboost.XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)

# Predictions for both splits, used by the evaluation cells that follow.
y_train_preds, y_test_preds = (xgb.predict(features_part) for features_part in (X_train, X_test))



In [57]:
# Pair every training column with its importance and order the pairs from most to least important.
headers = ["name", "score"]
values = sorted(
    zip(X_train.columns, xgb.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
xgb_feature_importances = pd.DataFrame(values, columns=headers)

# Keep the 15 most important features for the chart.
top_importances = xgb_feature_importances.iloc[:15]
features = top_importances['name']
scores = top_importances['score']
y_pos = np.arange(len(features))

# Bar chart: one bar per feature, feature names as vertical tick labels.
plt.figure(figsize=(10, 5))
plt.bar(y_pos, scores, align='center', alpha=0.5)
plt.xticks(y_pos, features, rotation='vertical')
plt.xlabel('Data Features')
plt.ylabel('Rating / Score')
plt.title('Feature importances (XGBoost)')

# Save the figure before showing it.
plt.savefig('feature importances XGB.png')

plt.show()

# Question 3: Predicting Reviews Rating for AirBnb hosts

*  **After identifying the features, use the ML model to predict customer review ratings on a held-out test set**

In [58]:
# Mean squared error on the train and test splits.
train_mse = mean_squared_error(y_train, y_train_preds)
test_mse = mean_squared_error(y_test, y_test_preds)
print('MSE train: %.3f, test: %.3f' % (train_mse, test_mse))

# Coefficient of determination on the train and test splits.
train_r2 = r2_score(y_train, y_train_preds)
test_r2 = r2_score(y_test, y_test_preds)
print('R^2 train: %.3f, test: %.3f' % (train_r2, test_r2))

In [59]:
# Turn both prediction arrays into single-column 2-D arrays, matching the one-column
# target frames they are compared against in the residual plot.
y_train_preds = np.reshape(y_train_preds, (-1, 1))
y_test_preds = np.reshape(y_test_preds, (-1, 1))

print(f'train data shape{y_train_preds.shape}')
print(f'test data shape{y_test_preds.shape}')

In [60]:
# Residuals are prediction minus actual, computed separately for each split.
train_residuals = y_train_preds - y_train
test_residuals = y_test_preds - y_test

# Residuals against predictions: train as orange squares, test as light-blue circles.
plt.scatter(y_train_preds, train_residuals, c='orange', marker='s', label='Train data')
plt.scatter(y_test_preds, test_residuals, c='lightblue', marker='o', label='Test data')
plt.xlabel('Predictions')
plt.ylabel('Residuals / Errors')
plt.legend(loc='lower right')
plt.show()


In [61]:
from math import sqrt

# Root mean squared error on the test split (square root of the test MSE).
test_split_mse = mean_squared_error(y_test, y_test_preds)
rmse_xgb = sqrt(test_split_mse)
rmse_xgb