In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.style.use('seaborn')
import seaborn as sns

import re
import xgboost as xgb

import warnings
warnings.filterwarnings("ignore")

In [2]:
import utils
df_raw = pd.read_csv('data/summary_listings.csv', low_memory=False, dtype=utils.schema)

In [3]:
TARGET = 'price'

# Imputing Data

In [4]:
df_train = df_raw.drop(columns=['neighbourhood_group', 'name', 'host_name', 'last_review'])
df_train.reviews_per_month.fillna(value=0, inplace=True)
df_train['neighbourhood'] = df_train.neighbourhood.astype('category')
df_train['room_type'] = df_train.room_type.astype('category')
df_train.dtypes

id                                   int64
host_id                              int64
neighbourhood                     category
latitude                           float64
longitude                          float64
room_type                         category
price                                int64
minimum_nights                       int64
number_of_reviews                    int64
reviews_per_month                  float64
calculated_host_listings_count       int64
availability_365                     int64
dtype: object

## Split data based on categorical column
* level one splitting: neighbourhood
* level 2 splitting: room_type

In [5]:
cat_feats = df_train.select_dtypes(include=['category']).copy()
unique_cat_feats = cat_feats.drop_duplicates()
unique_neighbourhoods = list(unique_cat_feats['neighbourhood'].drop_duplicates())
unique_room_types = list(unique_cat_feats['room_type'].drop_duplicates())

hierarchical_dfs = {}
for neighborhood in unique_neighbourhoods:
    df_neighborhood = df_train[df_train['neighbourhood'] == neighborhood]
    hierarchical_dfs[neighborhood] = {}
    for room_type in unique_room_types:
        sanitized_room_type = room_type.replace(" ", "_").replace("/", "_")
        hierarchical_df = df_neighborhood[df_neighborhood['room_type'] == room_type]
        if hierarchical_df.shape[0] > 0:
            hierarchical_dfs[neighborhood][sanitized_room_type] = hierarchical_df.drop(columns = ['neighbourhood', 'room_type']).copy()
            print("{} / {} has {} rows".format(neighborhood, sanitized_room_type, hierarchical_df.shape[0]))

松山區 / Private_room has 1102 rows
松山區 / Hotel_room has 205 rows
松山區 / Entire_home_apt has 1449 rows
松山區 / Shared_room has 119 rows
中正區 / Private_room has 3300 rows
中正區 / Hotel_room has 621 rows
中正區 / Entire_home_apt has 3715 rows
中正區 / Shared_room has 1203 rows
文山區 / Private_room has 520 rows
文山區 / Hotel_room has 28 rows
文山區 / Entire_home_apt has 445 rows
文山區 / Shared_room has 74 rows
大安區 / Private_room has 2839 rows
大安區 / Hotel_room has 122 rows
大安區 / Entire_home_apt has 4888 rows
大安區 / Shared_room has 554 rows
中山區 / Private_room has 2357 rows
中山區 / Hotel_room has 931 rows
中山區 / Entire_home_apt has 3015 rows
中山區 / Shared_room has 236 rows
南港區 / Private_room has 281 rows
南港區 / Hotel_room has 24 rows
南港區 / Entire_home_apt has 222 rows
南港區 / Shared_room has 113 rows
士林區 / Private_room has 739 rows
士林區 / Hotel_room has 15 rows
士林區 / Entire_home_apt has 863 rows
士林區 / Shared_room has 219 rows
內湖區 / Private_room has 469 rows
內湖區 / Entire_home_apt has 394 rows
內湖區 / Shared_room has 82 rows
萬華

# Hierarchical Model Training

In [6]:
from datetime import datetime
from os.path import join
from pathlib import Path

bestparams = {
    'colsample_bytree': 0.7,
    'gamma': 0.2,
    'learning_rate': 0.1,
    'max_depth': 10,
    'n_estimators': 500
}

model = {}
multi_model_folder = "model/multimodel_{}".format(datetime.now().strftime("%Y%m%d_%H%M%S"))
Path(multi_model_folder).mkdir(parents=True)

for neighborhood, room_type_dfs in hierarchical_dfs.items():
    for room_type, df in hierarchical_dfs[neighborhood].items():
        print("training for {} / {}".format(neighborhood, room_type))
        y_train = df[TARGET]
        X_train = df.drop(columns=[TARGET])
        # instantiate xgboost with best parameters
        best_booster = xgb.XGBRegressor(
            colsample_bytree=bestparams['colsample_bytree'],
            gamma=bestparams['gamma'],
            learning_rate=bestparams['learning_rate'],
            max_depth=bestparams['max_depth'],
            n_estimators=bestparams['n_estimators'],
            random_state=4,
            tree_method='gpu_hist',
            gpu_id=0
        )
        
        # train
        best_booster.fit(X_train, y_train)
        model_name = "model.json"
        
        # export model
        model_folder = join(multi_model_folder, neighborhood, room_type)
        Path(model_folder).mkdir(parents=True)
        model_path = join(model_folder, model_name)
        best_booster.save_model(model_path)

training for 松山區 / Private_room
training for 松山區 / Hotel_room
training for 松山區 / Entire_home_apt
training for 松山區 / Shared_room
training for 中正區 / Private_room
training for 中正區 / Hotel_room
training for 中正區 / Entire_home_apt
training for 中正區 / Shared_room
training for 文山區 / Private_room
training for 文山區 / Hotel_room
training for 文山區 / Entire_home_apt
training for 文山區 / Shared_room
training for 大安區 / Private_room
training for 大安區 / Hotel_room
training for 大安區 / Entire_home_apt
training for 大安區 / Shared_room
training for 中山區 / Private_room
training for 中山區 / Hotel_room
training for 中山區 / Entire_home_apt
training for 中山區 / Shared_room
training for 南港區 / Private_room
training for 南港區 / Hotel_room
training for 南港區 / Entire_home_apt
training for 南港區 / Shared_room
training for 士林區 / Private_room
training for 士林區 / Hotel_room
training for 士林區 / Entire_home_apt
training for 士林區 / Shared_room
training for 內湖區 / Private_room
training for 內湖區 / Entire_home_apt
training for 內湖區 / Shared_room
traini

# Forecast with Holdout

In [7]:
holdout = pd.read_csv('data/holdout/listings202103.csv', low_memory=False, dtype=utils.schema)

In [8]:
from sklearn.metrics import mean_squared_error, r2_score
holdout = holdout.drop(columns=['neighbourhood_group', 'name', 'host_name', 'last_review'])
holdout.reviews_per_month.fillna(value=0, inplace=True)
holdout['neighbourhood'] = holdout.neighbourhood.astype('category')
holdout['room_type'] = holdout.room_type.astype('category')

cat_feats = holdout.select_dtypes(include=['category']).copy()
unique_cat_feats = cat_feats.drop_duplicates()
unique_neighbourhoods = list(unique_cat_feats['neighbourhood'].drop_duplicates())
unique_room_types = list(unique_cat_feats['room_type'].drop_duplicates())

hierarchical_score = {}
for neighborhood in unique_neighbourhoods:
    df_neighborhood = holdout[holdout['neighbourhood'] == neighborhood]
    hierarchical_score[neighborhood] = {}
    for room_type in unique_room_types:
        sanitized_room_type = room_type.replace(" ", "_").replace("/", "_")
        hierarchical_df = df_neighborhood[df_neighborhood['room_type'] == room_type]
        if hierarchical_df.shape[0] > 0:
            hierarchical_holdout = hierarchical_df.drop(columns = ['neighbourhood', 'room_type']).copy()
            y_holdout = hierarchical_holdout[TARGET]
            hierarchical_holdout.drop(columns=[TARGET], inplace=True)
            # export model
            model_folder = join(multi_model_folder, neighborhood, sanitized_room_type)
            model_name = 'model.json'
            model_path = join(model_folder, model_name)
            model = xgb.XGBRegressor()
            model.load_model(model_path)
            y_pred = model.predict(hierarchical_holdout)
            RMSE = np.sqrt(mean_squared_error(y_holdout,y_pred))
            predictions = {
                'holdout': hierarchical_holdout,
                'y_pred': y_pred,
                'y_holdout': y_holdout,
                'RMSE': RMSE,
            }
            hierarchical_score[neighborhood][sanitized_room_type] = predictions

In [9]:
y_pred = []
y_holdout = []
for neighborhood, room_type_scores in hierarchical_score.items():
    for room_type, score in room_type_scores.items():
        y_pred.append(score['y_pred'])
        y_holdout.append(score['y_holdout'])
y_pred = np.concatenate(y_pred)
y_holdout = np.concatenate(y_holdout)
RMSE = np.sqrt(mean_squared_error(y_holdout,y_pred))
print(f"RMSE: {round(RMSE, 4)}")

RMSE: 4149.936


## Further Investigate Performance by Segment

In [10]:
y_pred = []
y_holdout = []
for neighborhood, room_type_scores in hierarchical_score.items():
    for room_type, score in room_type_scores.items():
        if score['RMSE'] < 10000:
            y_pred.append(score['y_pred'])
            y_holdout.append(score['y_holdout'])
        else:
            print("{} / {} has really bad performance: {}".format(neighborhood, room_type, score['RMSE']))
y_pred = np.concatenate(y_pred)
y_holdout = np.concatenate(y_holdout)
RMSE = np.sqrt(mean_squared_error(y_holdout,y_pred))
print(f"RMSE: {round(RMSE, 4)}")

松山區 / Private_room has really bad performance: 13973.357368491892
內湖區 / Private_room has really bad performance: 13023.306786451742
RMSE: 3465.0202


### Check on price distribution for the abnormal district

In [16]:
neighbor_condition = (df_train['neighbourhood'] == '松山區') | (df_train['neighbourhood'] == '內湖區')
room_type_cndition = df_train['room_type'] == 'Private room'
df_train[neighbor_condition & room_type_cndition][TARGET].describe()

count     1571.000000
mean      2068.362190
std       6358.816385
min        251.000000
25%        864.000000
50%       1193.000000
75%       1789.000000
max      99999.000000
Name: price, dtype: float64

In [17]:
neighbor_condition = (holdout['neighbourhood'] == '松山區') | (df_train['neighbourhood'] == '內湖區')
room_type_cndition = holdout['room_type'] == 'Private room'
holdout[neighbor_condition & room_type_cndition][TARGET].describe()

count       157.00000
mean       3810.33121
std       12636.79219
min         256.00000
25%        1146.00000
50%        1424.00000
75%        2526.00000
max      146557.00000
Name: price, dtype: float64

### Filter records contributing to bad performance

In [15]:
bad_score = hierarchical_score['松山區']['Private_room']
bad_holdout = bad_score['holdout']
bad_holdout[TARGET] = bad_score['y_holdout']
bad_holdout['y_pred'] = bad_score['y_pred']
bad_holdout = bad_holdout.drop(columns=["latitude", "longitude"])
diff = bad_score['y_holdout'] - bad_score['y_pred']
bad_holdout[diff * diff > 1.0e+8]

Unnamed: 0,id,host_id,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,price,y_pred
5107,47730011,175128252,1,0,0.0,37,363,146557,6992.318359
5110,47742212,175128252,1,0,0.0,37,274,40149,5372.949707
5227,48599863,175128252,1,0,0.0,37,274,49559,3777.898682


### Turned out the one single host is messing up with all of our prediction

In [26]:
df_raw[df_raw['host_id'] == 175128252][TARGET].describe()

count       70.000000
mean      6665.271429
std       8439.308054
min       1383.000000
25%       2051.750000
50%       3223.000000
75%       6627.500000
max      43129.000000
Name: price, dtype: float64

In [25]:
df_raw[df_raw['id'] == 47730011][TARGET].describe()

count        2.00000
mean     11157.00000
std        270.11479
min      10966.00000
25%      11061.50000
50%      11157.00000
75%      11252.50000
max      11348.00000
Name: price, dtype: float64