## Solving Rental Listing Inquiries

### *Problem Statement:* 
Predicting apartment rental listing popularity based on the listing content

In [None]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
plt.style.use('ggplot')
# Reference for customizing plots : http://matplotlib.org/users/customizing.html
# print(plt.style.available)

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Kaggle dataset:
# https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries/data

# Read the provided train and test JSON files into DataFrames
# (paths are relative to the notebook's working directory).
df_train, df_test = (
    pd.read_json(json_path)
    for json_path in (
        '../../random/data/rental_listing/train.json',
        '../../random/data/rental_listing/test.json',
    )
)

In [None]:
# Preview the first rows of the training listings as loaded.
df_train.head()

In [None]:
def add_listing_features(listings):
    """Return a copy of ``listings`` with the derived numeric features added.

    Parameters
    ----------
    listings : pandas.DataFrame
        Must provide the columns ``photos`` and ``features`` (one sized
        container per row, measured with ``len``) and ``created``
        (anything ``pd.to_datetime`` can parse).

    Returns
    -------
    pandas.DataFrame
        A new frame with ``no_photos``, ``no_features``,
        ``created_transformed``, ``created_yr``, ``created_day``,
        ``created_month`` and ``created_month_YrMnth`` added (or
        overwritten if already present). The input frame is not modified,
        so re-running the cell is safe.
    """
    # Parse once and derive every calendar field from the same Series.
    created = pd.to_datetime(listings["created"])
    return listings.assign(
        no_photos=listings["photos"].apply(len),
        no_features=listings["features"].apply(len),
        created_transformed=created,
        created_yr=created.dt.year,
        created_day=created.dt.day,
        created_month=created.dt.month,
        # Year and month packed into one integer: 100 * year + month.
        created_month_YrMnth=100 * created.dt.year + created.dt.month,
    )


# Apply the same feature engineering to both splits so their columns agree.
df_train = add_listing_features(df_train)
df_test = add_listing_features(df_test)

# Report the row counts only; the message promises a number of rows, not the
# full (rows, columns) shape tuple.
print("Number of rows in Train: {}".format(df_train.shape[0]))
print("Number of rows in Test: {}".format(df_test.shape[0]))

In [None]:
# Preview the training frame again, now including the engineered columns.
df_train.head()

In [None]:
# List every column available on the training frame.
df_train.columns

In [None]:
# Display the distinct values of the target column `interest_level`
# (the cell output is the array of labels themselves, not a count).
print("Number of class types")
np.unique(df_train['interest_level'])

In [None]:
# Numeric inputs only for now. The order matters: later cells pick columns
# from X_train by position.
features_considered = [
    # raw listing attributes
    'bathrooms',
    'bedrooms',
    'latitude',
    'longitude',
    'price',
    # engineered counts
    'no_photos',
    'no_features',
    # engineered calendar fields
    'created_yr',
    'created_day',
    'created_month',
    'created_month_YrMnth',
]

In [None]:
# Feature matrix: just the selected numeric columns of the training frame.
X = df_train[features_considered]

# Target: the string interest levels encoded as integer category codes.
y = (
    df_train['interest_level']
    .astype('category')
    .cat.codes
)
print("Converted Labels: {}".format(np.unique(y)))

In [None]:
# Hold out 30% of the rows for validation, stratified on the label so both
# parts keep the same class mix; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    shuffle=True,
    random_state=42,
)

In [None]:
import xgboost as xgb

# Base model: gradient-boosted trees trained on the training split.
gbm = xgb.XGBClassifier(
    max_depth=8,
    n_estimators=500,
    learning_rate=0.1,
    n_jobs=-1,
)
gbm = gbm.fit(X_train, y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A small class-balanced random forest fitted on the same training split.
# NOTE(review): `clf` is not referenced by any later cell visible in this notebook.
clf = RandomForestClassifier(
    n_estimators=10,
    class_weight="balanced",
    oob_score=True,
    random_state=1,
)
clf.fit(X_train, y_train)

In [None]:
# Hard-label predictions of the XGBoost model on both splits.
y_hat_train, y_hat_val = gbm.predict(X_train), gbm.predict(X_val)

In [None]:
%matplotlib inline
from skater.core.visualizer import decision_boundary as db

# Static decision-boundary plot over two inputs chosen by position in X_train:
# column 2 is 'latitude' and column 5 is 'no_photos' (order of features_considered).
# One colour is supplied per class code.
# NOTE(review): gbm was fitted on all 11 features but only two columns are passed
# here. Confirm whether plot_decision_boundary refits the estimator it receives;
# if it does, `gbm` would no longer be the full-feature model in later cells.
_, _ = db.plot_decision_boundary(gbm, X0=X_train.iloc[:, 2], X1=X_train.iloc[:, 5], Y=y_train, width=10,
                          height=10, static_color_map=['deeppink', 'darkturquoise', 'maroon'])

In [None]:
# Same two inputs as the static plot (column 2 'latitude', column 5 'no_photos'),
# rendered with mode='interactive'. The two returned objects are kept as `f` and `p`.
# NOTE(review): as with the static plot, confirm this call does not refit `gbm`.
f, p = db.plot_decision_boundary(gbm, X0=X_train.iloc[:, 2], X1=X_train.iloc[:, 5], Y=y_train, width=10,
                          height=10, mode='interactive')

## Evaluation

In [None]:
from sklearn.metrics import classification_report

# Class names listed in the order of the integer codes (0, 1, 2).
# NOTE(review): assumes .cat.codes numbered the labels alphabetically - confirm.
target_names = ['high', 'low', 'medium']

# One report per split, comparing the true labels with the XGBoost predictions.
for heading, y_true, y_pred in (
    ("\n--------Train dataset classification report----------\n", y_train, y_hat_train),
    ("\n--------Validation/Holdout dataset classification report----------\n", y_val, y_hat_val),
):
    print(heading)
    print(classification_report(y_true, y_pred, target_names=target_names))

## Using Skater to understand decision policies

In [None]:
from skater.core.explanations import Interpretation
from skater.model import InMemoryModel
from skater.util.dataops import show_in_notebook
from skater.util.logger import _INFO


# Skater setup: an Interpretation built over the training features, plus an
# in-memory wrapper around the fitted XGBoost model's hard-label `predict`.
# unique_values lists the integer class codes; target_names are the same codes as strings.
interpreter = Interpretation(X_train, feature_names=features_considered)
model_inst = InMemoryModel(gbm.predict, examples=X_train, model_type='classifier', unique_values=[0, 1, 2],
                           feature_names=features_considered, target_names=['0', '1', '2'], log_level=_INFO)

In [None]:
# Build a tree surrogate whose oracle is the wrapped XGBoost model, then fit it
# on the training split with pruning disabled (prune=None).
# NOTE(review): use_oracle=True presumably makes the tree learn from the oracle's
# predictions rather than from y_train - confirm against the skater documentation.
surrogate_explainer = interpreter.tree_surrogate(oracle=model_inst, class_weight="balanced", seed=5)
surrogate_explainer.fit(X_train, y_train, use_oracle=True, prune=None, scorer_type='default')

In [None]:
# From here on the reports label the classes by numeric code; this rebinds
# `target_names` for every later report cell.
target_names = ['class 0', 'class 1', 'class 2']

# Score the unpruned surrogate tree against the true validation labels.
y_hat = surrogate_explainer.predict(X_val)
print("\n--------Validation/Holdout dataset classification report----------\n")
print(
    classification_report(
        y_val,
        y_hat,
        target_names=target_names,
    )
)

In [None]:
%matplotlib inline
# Draw the unpruned surrogate tree, one colour per class, into
# 'surrogate_tree_rental_no_prune.png'. show_img=False is passed here, and the
# image is displayed by a separate show_in_notebook cell.
surrogate_explainer.plot_global_decisions(colors=['lightsteelblue', 'darkkhaki', 'aquamarine'], 
                                          file_name='surrogate_tree_rental_no_prune.png', show_img=False)

In [None]:
#show_in_notebook('surrogate_tree_rental_no_prune.png', width=400, height=400)

In [None]:
# Display the saved image of the unpruned tree in interactive mode.
# The interactivity is limited and more work is needed there;
# this is only a temporary solution.
show_in_notebook('surrogate_tree_rental_no_prune.png', width=900, height=400 , mode='interactive')

In [None]:
# Example of a hyper-parameter grid, kept for reference (not passed to fit below):
# params = {"criterion": ['gini', 'entropy'], "min_samples_leaf": [2, 4],
# "max_leaf_nodes": [2, 4, 6, 8, 10], "max_depth": [4, 6, 10, 14, 18]
# }

# Refit the SAME surrogate_explainer object, this time with prune='pre'.
# Later cells that use `surrogate_explainer` therefore see this pre-pruned fit,
# not the unpruned one from the earlier cell.
surrogate_explainer.fit(X_train, y_train, use_oracle=True, prune='pre', scorer_type='default')

In [None]:
# Draw the pre-pruned surrogate tree into 'surrogate_tree_rental_pruned.png'
# (show_img=False; the next cell displays the file).
surrogate_explainer.plot_global_decisions(colors=['lightsteelblue', 'darkkhaki', 'aquamarine'], 
                                          file_name='surrogate_tree_rental_pruned.png', show_img=False)

In [None]:
# Display the saved image of the pre-pruned surrogate tree.
show_in_notebook('surrogate_tree_rental_pruned.png', width=400, height=400)

In [None]:
# Validation report for the pre-pruned surrogate tree.
# The predictions get their own name: binding them to `y_hat_val` would
# overwrite the XGBoost validation predictions that the Evaluation cell
# reports, so re-running that cell would silently score the surrogate instead.
y_hat_val_pre = surrogate_explainer.predict(X_val)
print("\n--------Validation/Holdout dataset classification report----------\n")
print(classification_report(y_val, y_hat_val_pre, target_names=target_names))

In [None]:
# A second, independent surrogate built with the same oracle and settings as the
# first one, fitted without pruning (prune=None). A later cell refits it with prune='post'.
surrogate_explainer2 = interpreter.tree_surrogate(oracle=model_inst, class_weight="balanced", seed=5)
surrogate_explainer2.fit(X_train, y_train, use_oracle=True, prune=None, scorer_type='default')

In [None]:
# Validation report for surrogate_explainer2 in its current state. Run in order,
# that is the unpruned fit, because the prune='post' refit happens in a later cell.
y_hat_val2 = surrogate_explainer2.predict(X_val)
print("\n--------Validation/Holdout dataset classification report----------\n")
print(classification_report(y_val, y_hat_val2, target_names=target_names))

In [None]:
# Refit surrogate_explainer2 in place with prune='post'; later cells that use
# this object see the post-pruned fit.
surrogate_explainer2.fit(X_train, y_train, use_oracle=True, prune='post', scorer_type='default')

In [None]:
# Draw the post-pruned surrogate tree into 'surrogate_tree_rental_postpruned.png'
# (show_img=False; the next cell displays the file).
# Plotting is currently slow and not very interactive; this needs to be improved.
surrogate_explainer2.plot_global_decisions(colors=['lightsteelblue', 'darkkhaki', 'aquamarine'], 
                                          file_name='surrogate_tree_rental_postpruned.png', show_img=False)

In [None]:
# Display the saved image of the post-pruned surrogate tree.
show_in_notebook('surrogate_tree_rental_postpruned.png', width=400, height=400)

In [None]:
# Visualization of large graphs still needs to be fixed; in the meantime,
# output the post-pruned tree's decisions in text form.
surrogate_explainer2.decisions_as_txt()

In [None]:
# Validation report for surrogate_explainer2 after the prune='post' refit.
# This rebinds `y_hat_val2`, replacing the predictions stored by the earlier
# report cell for the same explainer.
y_hat_val2 = surrogate_explainer2.predict(X_val)
print("\n--------Validation/Holdout dataset classification report----------\n")
print(classification_report(y_val, y_hat_val2, target_names=target_names))

#### Changing the scoring function to 'log-loss'

In [None]:
# Rebuild the Skater objects, this time wrapping the XGBoost model's
# `predict_proba` instead of `predict` (no unique_values are passed here).
# This rebinds `interpreter` and `model_inst`; surrogates created from here on
# use the new wrapper as their oracle.
interpreter = Interpretation(X_train, feature_names=features_considered)
model_inst = InMemoryModel(gbm.predict_proba, examples=X_train, model_type='classifier',
                           feature_names=features_considered, target_names=['0', '1', '2'], log_level=_INFO)

In [None]:
# Third surrogate: oracle is the predict_proba wrapper, fitted with post-pruning
# and scorer_type='cross_entropy' (the "log-loss" variant named in the heading above).
surrogate_explainer3 = interpreter.tree_surrogate(oracle=model_inst, class_weight="balanced", seed=5)
surrogate_explainer3.fit(X_train, y_train, use_oracle=True, prune='post', scorer_type='cross_entropy')

In [None]:
# Validation report for the post-pruned, cross-entropy-scored surrogate.
y_hat_val3 = surrogate_explainer3.predict(X_val)
print("\n--------Validation/Holdout dataset classification report----------\n")
print(classification_report(y_val, y_hat_val3, target_names=target_names))

#### How does a hierarchical, interpretable tree-based model trained without the oracle's predictions perform?

In [None]:
# Fourth tree: same construction, but fitted with use_oracle=False and post-pruning.
# A comment in the test-set prediction cell describes this model as "not trained on
# the predictions of the base model".
surrogate_explainer4 = interpreter.tree_surrogate(oracle=model_inst, class_weight="balanced", seed=5)
surrogate_explainer4.fit(X_train, y_train, use_oracle=False, prune='post', scorer_type='default')

In [None]:
# Validation report for the tree fitted with use_oracle=False.
y_hat_val4 = surrogate_explainer4.predict(X_val)
print("\n--------Validation/Holdout dataset classification report----------\n")
print(classification_report(y_val, y_hat_val4, target_names=target_names))

## Evaluating on supplied test dataset

In [None]:
# Same feature columns as used for training, taken from the supplied test set.
X_test = df_test[features_considered]

# using base estimator
y_hat_test_base_model = gbm.predict_proba(X_test)

# surrogate_explainer as last fitted in the notebook (prune='pre')
y_hat_test_surrogate = surrogate_explainer.predict(X_test, prob_score=True)

# post + F1 score
y_hat_test_surrogate_post_f1 = surrogate_explainer2.predict(X_test, prob_score=True)

# post + log-loss
y_hat_test_surrogate_post_ll = surrogate_explainer3.predict(X_test, prob_score=True)

# not trained on the predictions of the base model
y_hat_test_surrogate_i = surrogate_explainer4.predict(X_test, prob_score=True)

# Sanity check: every model must return exactly one row of scores per test listing.
for test_scores in (
    y_hat_test_base_model,
    y_hat_test_surrogate,
    y_hat_test_surrogate_post_f1,
    y_hat_test_surrogate_post_ll,
    y_hat_test_surrogate_i,
):
    assert X_test.shape[0] == test_scores.shape[0]

In [None]:
## constructing the format needed for submission
def build_submission_frame(class_probabilities, listing_ids):
    """Assemble one submission table from per-class scores.

    Parameters
    ----------
    class_probabilities : array-like
        One row per test listing and one column per class, in the order
        'high', 'low', 'medium'.
    listing_ids : list
        Listing identifiers, one per row of ``class_probabilities``.

    Returns
    -------
    pandas.DataFrame
        Columns ``listing_id, high, low, medium`` in that order.

    Raises
    ------
    ValueError
        If ``listing_ids`` does not have one entry per row.
    """
    frame = pd.DataFrame(class_probabilities, columns=['high', 'low', 'medium'])
    # Put the identifier first so no separate column re-ordering is needed.
    frame.insert(0, 'listing_id', listing_ids)
    return frame


list_id = df_test['listing_id'].tolist()

# base
result_df_base = build_submission_frame(y_hat_test_base_model, list_id)

# Surrogate
result_df_surrogate = build_submission_frame(y_hat_test_surrogate, list_id)

# post + F1 score
result_df_surrogate_post_f1 = build_submission_frame(y_hat_test_surrogate_post_f1, list_id)

# post + log-loss
result_df_surrogate_post_ll = build_submission_frame(y_hat_test_surrogate_post_ll, list_id)

# a better interpretable tree using post-pruning
result_df_surrogate_i = build_submission_frame(y_hat_test_surrogate_i, list_id)

In [None]:
# converting to csv
# Using surrogate models for predicting didn't give good result but nevertheless could possibly be used
# for explaining the decisions approximately.
# The interpretable model without using predictions from the base model (Oracle) didn't help either
# in this case.
submission_files = (
    (result_df_base, 'submission_base.csv'),
    (result_df_surrogate, 'submission_surrogate.csv'),
    (result_df_surrogate_post_f1, 'submission_post_f1.csv'),
    (result_df_surrogate_post_ll, 'submission_post_ll.csv'),
    (result_df_surrogate_i, 'submission_i.csv'),
)
# Write each table without the index column, in the same order as before.
for submission_frame, csv_name in submission_files:
    submission_frame.to_csv(csv_name, index=False)