In [None]:
import sklearn as sk
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the raw listings table; every later cell derives from `data`.
data = pd.read_csv("../input/us-airbnb-open-data/AB_US_2020.csv")

In [None]:
# Column dtypes: shows which columns are object-typed and will need encoding or dropping.
data.dtypes

In [None]:
# Peek at the first three rows.
data.head(3)

In [None]:
# Number of distinct city labels (a missing value, if any, counts as one label).
data['city'].nunique(dropna = False)

In [None]:
# Listings per city, most frequent first.
data.city.value_counts()

Let's first see whether we can predict the city from the latitude/longitude.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def trainCityModel(X, y, test_size = 0.6, random_state = None):
    """Fit a depth-limited decision-tree classifier on a random split and print weighted F1.

    Despite its name the helper is generic: the notebook reuses it for any
    categorical target (city, neighbourhood group, neighbourhood).

    Parameters
    ----------
    X : feature table passed to ``train_test_split`` / ``fit``.
    y : class labels aligned with ``X``.
    test_size : share of rows held out for testing. The default of 0.6
        reproduces the original hard-coded split (only 40 % is used for fitting).
    random_state : seed forwarded to both the split and the tree. ``None``
        (default) keeps the original unseeded behaviour; pass an int to make
        the printed scores reproducible.

    Returns
    -------
    The fitted ``DecisionTreeClassifier``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = test_size, random_state = random_state
    )
    cityModel = DecisionTreeClassifier(min_samples_leaf = 25, max_depth = 25, random_state = random_state)
    cityModel.fit(X_train, y_train)
    # Test score first, then train score, so a gap between the two is easy to spot.
    print(f"f1 of test: {f1_score(y_test, cityModel.predict(X_test), average = 'weighted')}")
    print(f"f1 of train: {f1_score(y_train, cityModel.predict(X_train), average = 'weighted')}")
    return cityModel

In [None]:
# Coordinates -> city classifier.
cityModel = trainCityModel(
    X = data[['latitude', 'longitude']],
    y = data['city'],
)

In [None]:
# Listings plotted by coordinates, coloured by their true city label.
sns.scatterplot(data=data, x='latitude', y='longitude', hue='city')

In [None]:
# Same scatter, coloured by the city predicted by cityModel (compare visually with the plot above).
sns.scatterplot(
    x=data['latitude'],
    y=data['longitude'],
    hue=cityModel.predict(data[['latitude', 'longitude']]),
)

We conclude that City can be safely dropped since, in principle, it can be predicted from the coordinates.

Now we look into Room Type and Neighbourhoods

In [None]:
# Listings per room type.
data.room_type.value_counts()

In [None]:
# Listings per neighbourhood group (value_counts skips missing values by default).
data.neighbourhood_group.value_counts()

In [None]:
# One-hot city -> neighbourhood group; missing groups are folded into 'Other neighborhoods'.
hoodGroupCityModel = trainCityModel(
    X = pd.get_dummies(data[['city']]),
    y = data['neighbourhood_group'].fillna('Other neighborhoods'),
)

In [None]:
# The reverse direction: one-hot neighbourhood group -> city.
cityHoodGroupModel = trainCityModel(
    X = pd.get_dummies(data[['neighbourhood_group']].fillna('Other neighborhoods')),
    y = data['city'],
)

In [None]:
# Coordinates -> neighbourhood group.
hoodGroupModel = trainCityModel(
    X = data[['latitude', 'longitude']],
    y = data['neighbourhood_group'].fillna('Other neighborhoods'),
)

In [None]:
# Listings per neighbourhood.
data.neighbourhood.value_counts()

In [None]:
# Coordinates -> neighbourhood; missing neighbourhoods are labelled 'Unincorporated Areas'.
hoodModel = trainCityModel(
    X = data[['latitude', 'longitude']],
    y = data['neighbourhood'].fillna('Unincorporated Areas'),
)

In [None]:
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error
# Constant-prediction baseline for price, fitted and scored on the same rows
# (we don't mind the leak here).
dummyPriceRegressor = DummyRegressor().fit(X = data, y = data['price'])
mean_absolute_error(y_true = data['price'], y_pred = dummyPriceRegressor.predict(data))

In [None]:
from sklearn.tree import DecisionTreeRegressor
def trainNaiveRegressor(X, y, regressor = None, test_size = 0.6, random_state = None):
    """Fit a regressor on a random split and print test/train mean absolute error.

    Parameters
    ----------
    X : feature table passed to ``train_test_split`` / ``fit``.
    y : numeric target aligned with ``X``.
    regressor : estimator to fit in place and return. When ``None`` (default)
        a *new* ``DecisionTreeRegressor(min_samples_leaf=25, max_depth=25)`` is
        created for this call.
    test_size : share of rows held out for testing (default 0.6).
    random_state : seed for the split and for the default tree. ``None``
        (default) keeps the original unseeded behaviour.

    Returns
    -------
    The fitted regressor.
    """
    if regressor is None:
        # The default must not be an estimator instance in the signature: that
        # instance is built once at definition time, so every default call would
        # refit and return the *same* object, and models returned by earlier
        # calls would silently turn into the most recent fit.
        regressor = DecisionTreeRegressor(min_samples_leaf = 25, max_depth = 25, random_state = random_state)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = test_size, random_state = random_state
    )
    regressor.fit(X_train, y_train)
    print(f"mae of test: {mean_absolute_error(y_test, regressor.predict(X_test))}")
    print(f"mae of train: {mean_absolute_error(y_train, regressor.predict(X_train))}")
    return regressor

In [None]:
# Price from one-hot city alone.
naiveCityPriceCorrelation = trainNaiveRegressor(
    X = pd.get_dummies(data[['city']]),
    y = data['price'],
)

In [None]:
# Price from one-hot neighbourhood group alone.
naiveNGPriceCorrelation = trainNaiveRegressor(
    X = pd.get_dummies(data[['neighbourhood_group']].fillna('Other neighborhoods')),
    y = data['price'],
)


In [None]:
# Price from one-hot neighbourhood alone.
naiveHoodPriceCorrelation = trainNaiveRegressor(
    X = pd.get_dummies(data[['neighbourhood']].fillna('Unincorporated Areas')),
    y = data['price'],
)


In [None]:
geoClusters = 200
from sklearn.cluster import KMeans
# Seed the clustering: cluster ids become the geo_* feature columns that are
# reused further down, so an unseeded KMeans would change the feature matrix
# (and every score derived from it) on each Restart & Run All.
clusteringModel = KMeans(n_clusters=geoClusters, init='k-means++', random_state=0)
# One indicator column per cluster, named geo_<cluster id>; rows keep the positional order of `data`.
geoClusterDummies = pd.get_dummies(clusteringModel.fit_predict(data.loc[:, ['latitude', 'longitude']]), prefix = 'geo')
# Price from the cluster indicators alone, for comparison with city / neighbourhood dummies.
trainNaiveRegressor(geoClusterDummies, data.price)

Neighbourhood Group seems very similar to City: it is also predictable from the geo coordinates, even though the two do not correspond directly.

Neighbourhood is harder to predict, as it apparently has many more distinct values and a longer-tailed distribution.

City, Neighbourhood Group and Neighbourhood all perform poorly when naively dummy-encoded and used to predict price, with the 200 geo clusters performing a tiny bit better. Therefore, we will drop those three.

Room type has just 4 categorical values, so there is no need to think here.

In [None]:
# Number of listings with no last_review value.
data.last_review.isna().sum()

In [None]:
# Total number_of_reviews over the rows with a missing last_review
# (presumably a check that a missing date coincides with zero reviews).
data[data.last_review.isna()].number_of_reviews.sum()

In [None]:
# First cleaned frame:
#   * last_review becomes days elapsed up to 2020-11-09 (missing dates are
#     filled with the placeholder '01/01/00' before parsing),
#   * free-text and location-label columns are dropped,
#   * missing reviews_per_month becomes 0,
#   * room_type is one-hot encoded.
# NOTE(review): pd.to_datetime is called without an explicit format/dayfirst;
# confirm the parsed dates match the CSV's date convention.
data1 = pd.get_dummies(
    (
        data
        .assign(
            days_since_last_review = lambda frame: (
                pd.to_datetime('2020-11-09')
                - pd.to_datetime(frame['last_review'].fillna('01/01/00'))
            ).dt.days
        )
        .drop(columns = ['name', 'host_name', 'city', 'neighbourhood', 'neighbourhood_group', 'last_review'])
        .fillna({'reviews_per_month': 0})
    ),
    columns = ['room_type'],
)

Now we have gotten rid of all Object columns.

We will now get rid of the numeric ones which do not correspond to ordinals: Id, Host Id, Geo

Id is a clear drop; for Host Id we will consider dummification; Geo is for clustering & dummification

In [None]:
# Number of distinct listing ids.
data1['id'].nunique(dropna = False)

In [None]:
# Listings per host (index: host_id, value: number of rows).
hostCounts = data1['host_id'].value_counts()
hostCounts.describe()

In [None]:
# How many hosts have each listing count; the 20 most common counts (explicitly positional slice).
hostCounts.value_counts().iloc[:20]

In [None]:
# Per-host aggregates: listing count plus mean and std of price.
hostIdAggs = (
    data1
    .groupby('host_id')
    .agg({'host_id': ['count'], 'price': ['mean', 'std']})
)
# Summary restricted to hosts with more than one listing.
hostIdAggs.loc[hostIdAggs[('host_id', 'count')] > 1].describe()

In [None]:
# Flatten the two-level column index into plain tuples such as ('price', 'mean').
hostIdAggs.columns = hostIdAggs.columns.to_flat_index()
hostIdAggs.drop(columns = [('price', 'std')]).describe()

In [None]:
# Correlation between a host's listing count and its mean price.
hostIdAggs.drop(columns = [('price', 'std')]).corr()

In [None]:
# Constant-prediction baseline for the per-host mean price.
dummyAvgPriceRegressor = DummyRegressor()
dummyAvgPriceRegressor.fit(X = hostIdAggs, y = hostIdAggs[('price', 'mean')]) # we don't mind the leak here
# Score the baseline fitted just above. The earlier dummyPriceRegressor was
# fitted on per-listing prices, so using it here would evaluate the wrong constant.
mean_absolute_error(hostIdAggs[('price', 'mean')], dummyAvgPriceRegressor.predict(hostIdAggs))

In [None]:
from sklearn.tree import export_graphviz
import graphviz

# A deliberately tiny tree (depth 2): listing count -> mean price per host.
regressor = DecisionTreeRegressor(min_samples_leaf = 2, max_depth = 2)
naiveHousesToPrice = trainNaiveRegressor(
    X = hostIdAggs.loc[:, [('host_id', 'count')]],
    y = hostIdAggs[('price', 'mean')],
    regressor = regressor,
)

# Render the fitted tree so its split thresholds can be read off.
treeGraph = export_graphviz(
    naiveHousesToPrice,
    out_file=None,
    feature_names=[('host_id', 'count')],
)
graphviz.Source(treeGraph)

In [None]:
# Bucket hosts by listing count (boundaries 1 / 18 / 78).
# we are comitting a potential leak here, would have to be considered in temporal context
bigHosts = hostCounts.index[hostCounts > 78]
mediumHosts = hostCounts.index[(hostCounts > 18) & (hostCounts <= 78)]
smallishHosts = hostCounts.index[(hostCounts > 1) & (hostCounts <= 18)]
firstTimers = hostCounts.index[hostCounts <= 1]

# (host ids, bucket label) pairs; the labels are reused later to name the dummy columns.
hostSizeConstructor = [
    (bigHosts, 'big'),
    (mediumHosts, 'medium'),
    (smallishHosts, 'smallish'),
    (firstTimers, 'firstTimer'),
]

# One label per host id, stacked bucket after bucket.
hostSizeFeature = pd.concat(
    [
        pd.Series([label] * len(hostIds), index = hostIds)
        for hostIds, label in hostSizeConstructor
    ],
    axis = 'rows',
)
hostSizeFeature.value_counts()

It seems that the number of listings a host has does carry some signal: a decision tree was able to predict the average price a little better than the dummy baseline, so we use that tree to derive buckets for the feature.

We have an explicit assumption here that this feature can be computed -- for example, this would be satisfied if the hosts have to submit entries only once and in a batch.

*Post-hoc edit*: Uh-oh, later on I found that the existing `calculated_host_listings_count` column is exactly this group-by count feature...

Furthermore, we simply re-use the 200-clusters for Geo

In [None]:
# One-hot encode the host-size bucket; columns are named hostSize_<label>.
hostSizeDummies = pd.get_dummies(hostSizeFeature, prefix = 'hostSize')

In [None]:
# Re-check dtypes after the cleaning step.
data1.dtypes

In [None]:
# Final modelling frame: geo-cluster indicators are attached by row index,
# host-size indicators by host_id, then the raw identifier and coordinate
# columns are removed.
data2 = (
    data1
    .join(geoClusterDummies)
    .join(hostSizeDummies, on = 'host_id')
    .drop(columns = ['latitude', 'longitude', 'host_id', 'id'])
)
data2.describe()


In [None]:
# Split the modelling frame into target and features, then fit the default tree on all features.
fullY = data2['price']
fullX = data2.drop(columns = ['price'])
firstFullAttempt = trainNaiveRegressor(fullX, fullY)

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance of firstFullAttempt, estimated on a random 10 % of the rows.
permSampler = data2.sample(frac = 0.1)
permSamplerY = permSampler['price']
permSamplerX = permSampler.drop(columns = ['price'])

perm = PermutationImportance(firstFullAttempt)
perm.fit(permSamplerX, permSamplerY)
display(eli5.show_weights(perm, feature_names = list(fullX.columns)))

In [None]:
# Same default tree, with calculated_host_listings_count removed from the features.
firstFullAttemptWithoutListingsCount = trainNaiveRegressor(
    X = fullX.drop(columns = 'calculated_host_listings_count'),
    y = fullY,
)


In [None]:
import shap

# Explain ten random rows of the permutation-importance sample with SHAP.
# NOTE(review): make sure firstFullAttempt is still fitted on the same columns
# as shapSamples when this cell runs.
shapSamples = permSamplerX.sample(n = 10)
explainer = shap.TreeExplainer(firstFullAttempt)
shapValues = explainer.shap_values(shapSamples)
shap.initjs()
# NOTE(review): expected_value[0] assumes an array-like expected_value; confirm against the installed shap version.
shap.force_plot(explainer.expected_value[0], shapValues[0], shapSamples.iloc[0, :])

In [None]:
# Force plot for the second sampled row.
shap.force_plot(explainer.expected_value[0], shapValues[1], shapSamples.iloc[1, :])

In [None]:
# Force plot for the third sampled row.
shap.force_plot(explainer.expected_value[0], shapValues[2], shapSamples.iloc[2, :])

In [None]:
# Columns treated as redundant: the hostSize_* bucket indicators and the raw review count.
redundantFeatures = [f'hostSize_{label}' for _, label in hostSizeConstructor]
redundantFeatures.append('number_of_reviews')
firstFullAttemptWithoutRedundantFeatures = trainNaiveRegressor(
    X = fullX.drop(columns = redundantFeatures),
    y = fullY,
)


In [None]:
# Correlation between the total review count and the monthly review rate.
data2[['number_of_reviews', 'reviews_per_month']].corr()

A straightforwardly trained tree yields somewhat better performance than the dummy baseline, but suffers from some train-test discrepancy.

Listings count turns out to be a significant feature; its omission drops performance despite those bucket variables still being there. Getting rid of the bucket variables and the total number of reviews (while keeping the fairly correlated reviews per month) yields a little smaller train-test performance discrepancy.


In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest, attempt 1: 8 trees, depth <= 30, 20 % hold-out.
rfAttempt_1 = trainNaiveRegressor(
    X = fullX,
    y = fullY,
    regressor = RandomForestRegressor(n_estimators = 8, max_depth = 30),
    test_size = 0.2,
)


In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest, attempt 2: 16 trees, depth <= 24, 30 % hold-out.
rfAttempt_2 = trainNaiveRegressor(
    X = fullX,
    y = fullY,
    regressor = RandomForestRegressor(n_estimators = 16, max_depth = 24),
    test_size = 0.3,
)


In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest, attempt 3: redundant features removed, 8 trees, depth <= 38, 20 % hold-out.
rfAttempt_3 = trainNaiveRegressor(
    X = fullX.drop(columns = redundantFeatures),
    y = fullY,
    regressor = RandomForestRegressor(n_estimators = 8, max_depth = 38),
    test_size = 0.2,
)


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# Gradient boosting with library defaults, 20 % hold-out.
gbAttempt_1 = trainNaiveRegressor(
    X = fullX,
    y = fullY,
    regressor = GradientBoostingRegressor(),
    test_size = 0.2,
)

With Random Forests, the train-test performance discrepancy widens much more :/

Gradient Boosting, with default parameters, performs worse than a simple tree (but at least no discrepancy)

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import TransformedTargetRegressor

# Scorer name passed to cross_val_score in evalModelPipeline; the scores come
# back negated, and evalModelPipeline multiplies them by -1.
scoringStrategy = 'neg_mean_absolute_error'

def evalModelPipeline(preproc, model, targetReg = True, X = None, y = None):
    """Cross-validate ``preproc -> model`` and return the 5 per-fold mean absolute errors.

    Parameters
    ----------
    preproc : transformer used as the first pipeline step.
    model : regressor used as the final pipeline step.
    targetReg : when True, ``model`` is wrapped in a ``TransformedTargetRegressor``
        that min-max scales the target for fitting.
    X : feature table. Defaults to the *current* ``fullX`` without ``redundantFeatures``.
    y : target. Defaults to the *current* ``fullY``.

    Returns
    -------
    Array of per-fold MAE values (sign flipped back to positive); also printed.
    """
    # Resolve the defaults at call time. Default expressions in the signature
    # are evaluated once, when the cell defining the function runs, so they
    # would keep pointing at a stale snapshot if fullX / fullY are rebuilt later.
    if X is None:
        X = fullX.drop(redundantFeatures, axis = 'columns')
    if y is None:
        y = fullY
    if targetReg:
        model = TransformedTargetRegressor(regressor=model, transformer=MinMaxScaler())
    pipeline = Pipeline(steps = [('preproc', preproc), ('model', model)])
    # NOTE(review): an integer cv on a regressor means unshuffled KFold per the
    # scikit-learn docs; if the rows are ordered (e.g. by city), the folds are
    # not random samples. Confirm whether that is intended.
    scores = -1 * cross_val_score(pipeline, X, y,
                              cv=5,
                              scoring=scoringStrategy)
    print(scores)
    return scores

# Three interchangeable preprocessing steps. No explicit transformers are
# listed, so `remainder` is what gets applied to the columns.
stdPreproc = ColumnTransformer(transformers = [], remainder = StandardScaler())  # standardize
mmPreproc = ColumnTransformer(transformers = [], remainder = MinMaxScaler())  # min-max scale
noop = ColumnTransformer(transformers = [], remainder = 'passthrough')  # leave columns untouched

In [None]:
from sklearn.linear_model import LinearRegression, HuberRegressor
# Linear regression on: raw features; standardized features; standardized
# features with a min-max-scaled target.
evalModelPipeline(noop, LinearRegression(), targetReg = False)
evalModelPipeline(stdPreproc, LinearRegression(), targetReg = False)
evalModelPipeline(stdPreproc, LinearRegression())

In [None]:
# Linear regression on min-max-scaled features, without and with target scaling.
evalModelPipeline(mmPreproc, LinearRegression(), targetReg = False)
evalModelPipeline(mmPreproc, LinearRegression())

Oh, I guess I don't really know how to use Linear Regression properly :(

Why the values are off by multiple orders of magnitude is beyond my knowledge.

In [None]:
# this is rather just a sanity check
# Same depth-32 tree with and without min-max feature scaling, to compare the two score sets.
evalModelPipeline(noop, DecisionTreeRegressor(max_depth = 32), targetReg = False)
evalModelPipeline(mmPreproc, DecisionTreeRegressor(max_depth = 32), targetReg = False)

Interestingly enough, there are some non-trivial variations here... I wonder what is causing that.