In [106]:
import pandas as pd
import numpy as np
import google.cloud.bigquery as bq
import tensorflow as tf
from tensorflow import keras
from sklearn import model_selection
from sklearn import metrics
from sklearn import impute
from sklearn import pipeline
from sklearn import preprocessing
from sklearn import compose
from sklearn import linear_model
from sklearn import ensemble
from category_encoders import target_encoder

### Ingest Data

In [2]:
%load_ext google.cloud.bigquery

In [3]:
%%bigquery data
-- Load every column of the ad_details table into the DataFrame `data`,
-- keeping only rows that have both a price per square meter and a post code.
SELECT
  *
FROM `curious-bot.homes.ad_details`
WHERE price_per_sq_m IS NOT NULL
  AND post_code IS NOT NULL

In [4]:
# Index the ads by their 'id' column. Rebinding (instead of mutating in place)
# together with the guard keeps this cell re-runnable: once 'id' has become the
# index it is no longer a column, and a second set_index('id') would raise KeyError.
if 'id' in data.columns:
    data = data.set_index('id')

In [5]:
# Inspect column dtypes and non-null counts before modelling.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7544 entries, 5071b23d-0c43-4021-b5a5-91c34b612038 to a7b43aee-e5bc-42e6-abe2-8ab317a66c60
Data columns (total 16 columns):
new_building      7544 non-null bool
promoted          7544 non-null bool
price             7544 non-null int64
area              7544 non-null int64
price_per_sq_m    7544 non-null int64
type              7544 non-null object
viewed            7544 non-null datetime64[ns, UTC]
address           7544 non-null object
post_code         7544 non-null object
street_num        7532 non-null object
street_name       7540 non-null object
postal_town       7544 non-null object
country           7544 non-null object
lat               7544 non-null float64
lng               7544 non-null float64
short_desc        7544 non-null object
dtypes: bool(2), datetime64[ns, UTC](1), float64(2), int64(3), object(8)
memory usage: 898.8+ KB


### Define Utils

In [111]:
def demo(dataset, feature_col):
    """Print the dense encoding of a feature column for the first dataset element.

    Args:
        dataset: Iterable whose elements are indexable, with the features input
            at position 0 (presumably a batched ``tf.data.Dataset`` such as the
            one built by ``input_to_estimator`` - confirm against callers).
        feature_col: Feature column(s) handed to ``tf.keras.layers.DenseFeatures``.
    """
    first_element = next(iter(dataset))
    features = first_element[0]
    to_dense = tf.keras.layers.DenseFeatures(feature_col)
    dense_values = to_dense(features)
    print(dense_values.numpy())

def input_to_estimator(X, y, batch_size):
    """Wrap features and target in a batched ``tf.data.Dataset``.

    Args:
        X: Features; converted with ``dict(X)`` before slicing.
        y: Target values, sliced together with ``X``.
        batch_size: Number of consecutive elements combined into one batch.

    Returns:
        A dataset of ``(features_dict, target)`` batches. No shuffling is applied.
    """
    features_by_name = dict(X)
    sliced = tf.data.Dataset.from_tensor_slices((features_by_name, y))
    batched = sliced.batch(batch_size)
    return batched

def mean_absolute_prc_error(y_true, y_pred):
    """Return the mean absolute percentage error as a fraction (0.2 means 20%).

    Args:
        y_true: Array-like of observed values (list, ndarray or pandas Series).
        y_pred: Array-like of predicted values, in the same order as ``y_true``.

    Returns:
        Mean of ``|y_true - y_pred| / |y_true|`` over all elements.
    """
    # Coerce to plain float arrays so the comparison is positional. Without this,
    # two pandas Series are aligned on their index labels (mismatched labels turn
    # into NaN that the mean then silently skips), and plain lists do not support
    # subtraction at all.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true))

def get_metrics(y_true, y_pred, model_name):
    """Score predictions and return one row for the model comparison table.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, in the same order as ``y_true``.
        model_name: Label stored under the ``'model'`` key.

    Returns:
        Dict with keys ``'model'``, ``'mae'`` (mean absolute error), ``'mape'``
        (mean absolute percentage error as a fraction) and ``'r2'``.
    """
    # Score against the y_true argument; the previous version read the global
    # y_test, so the parameter was ignored and the function only worked while a
    # matching y_test happened to exist in the kernel.
    return {
        'model': model_name,
        'mae': metrics.mean_absolute_error(y_true, y_pred),
        'mape': mean_absolute_prc_error(y_true, y_pred),
        'r2': metrics.r2_score(y_true, y_pred),
    }

### Hyperparameters

In [112]:
# Seed passed to the train/test split so the partition is reproducible.
random_seed = 42
# Fraction of rows held out for evaluation.
test_size = 0.3
# Rows per batch in the tf.data datasets.
batch_size = 300
# Area boundaries 30, 60, ..., 150 (presumably for bucketizing `area` - confirm where used).
area_bins = [30, 60, 90, 120, 150]
# Number of bins KBinsDiscretizer splits `area` into.
n_area_bins = 5
# Training-row threshold a post code must exceed to get its own median in the benchmark.
min_points_per_post_code = 20

### Prepare Train And Test Datasets

In [113]:
# Columns used as model inputs; the target is the price per square meter.
X_cols = ['new_building', 'promoted', 'area', 'type', 'post_code']

# Full (unsplit) features and target.
X, y = data[X_cols], data['price_per_sq_m']

# Split whole rows so features and target stay aligned; seeded for reproducibility.
data_train, data_test = model_selection.train_test_split(
    data, test_size=test_size, random_state=random_seed
)

X_train = data_train[X_cols]
X_test = data_test[X_cols]
y_train = data_train['price_per_sq_m']
y_test = data_test['price_per_sq_m']

# Batched tf.data views of the same split.
ds_train = input_to_estimator(X_train, y_train, batch_size)
ds_test = input_to_estimator(X_test, y_test, batch_size)

### Benchmark Median Price Per Square Meter

In [114]:
# One metrics dict per evaluated model, appended as each model is scored.
metrics_list = list()
# Display formats: MAE as a whole number, MAPE and R2 as whole percentages.
style_format = dict(mae='{:.0f}', mape='{:.0%}', r2='{:.0%}')

In [115]:
# Naive baseline: predict the training-set median for every test row.
train_median = y_train.median()
y_pred = [train_median for _ in range(len(y_test))]
baseline_metrics = get_metrics(y_test, y_pred, 'benchmark_median_price')
metrics_list.append(baseline_metrics)

In [116]:
# Show the metrics collected so far, one row per model.
(
    pd.DataFrame(metrics_list)
    .set_index('model')
    .style
    .format(style_format)
)

Unnamed: 0_level_0,mae,mape,r2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
benchmark_median_price,16670,26%,-0%


### Benchmark Median Price Per Square Meter Per Post Code

In [117]:
# Per-post-code training prices: the median is the prediction, the count says
# how many rows that median is based on.
price_by_post_code = data_train.groupby('post_code').price_per_sq_m
med_price_post_code = price_by_post_code.median()
count_post_code = price_by_post_code.count()

In [118]:
# Keep post codes with more training rows than the threshold.
# NOTE(review): the comparison is strict, so a post code with exactly
# min_points_per_post_code rows is excluded - confirm this is intended.
has_enough_points = count_post_code > min_points_per_post_code
post_codes_above_min_points = count_post_code.loc[has_enough_points].index

In [119]:
# Medians for the sufficiently populated post codes, named so the merged
# column can be picked out below.
med_price_post_code_sel = (
    med_price_post_code
    .loc[post_codes_above_min_points]
    .rename('post_code_median_price')
)

# Left-join the medians onto the test rows by post code; rows whose post code
# has no selected median come out as NaN.
test_with_median = data_test.merge(
    med_price_post_code_sel,
    how='left',
    left_on='post_code',
    right_index=True,
)

# Fall back to the overall training median where no per-post-code median exists.
y_pred = test_with_median['post_code_median_price'].fillna(y_train.median())

In [120]:
# Record the per-post-code baseline in the comparison table.
post_code_metrics = get_metrics(y_test, y_pred, 'benchmark_median_price_per_post_code')
metrics_list.append(post_code_metrics)

In [121]:
# Show the metrics collected so far, one row per model.
(
    pd.DataFrame(metrics_list)
    .set_index('model')
    .style
    .format(style_format)
)

Unnamed: 0_level_0,mae,mape,r2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
benchmark_median_price,16670,26%,-0%
benchmark_median_price_per_post_code,15182,24%,15%


### Linear Regression

In [122]:
# post_code: fill missing values with the literal 'unknown', then target-encode.
# NOTE(review): confirm 'ignore' is a supported handle_unknown value in the
# installed category_encoders version.
cat_steps = [
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value='unknown')),
    ('target_enc', target_encoder.TargetEncoder(handle_unknown='ignore')),
]
cat_transformer = pipeline.Pipeline(steps=cat_steps)

# area: fill missing values with the median, then discretize into n_area_bins bins.
num_steps = [
    ('imputer', impute.SimpleImputer(strategy='median')),
    ('bucketize', preprocessing.KBinsDiscretizer(n_bins=n_area_bins)),
]
num_transformer = pipeline.Pipeline(steps=num_steps)

# Only post_code and area are routed to a transformer here.
column_transformers = [
    ('cat_transf', cat_transformer, ['post_code']),
    ('num_transf', num_transformer, ['area']),
]
preprocessor = compose.ColumnTransformer(transformers=column_transformers)

# Preprocessing followed by ordinary least squares.
linreg_steps = [
    ('preprocessor', preprocessor),
    ('linear_reg', linear_model.LinearRegression()),
]
linreg = pipeline.Pipeline(steps=linreg_steps)

In [123]:
# Fit the preprocessing steps and the linear model on the training split only.
linreg.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cat_transf',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='unknown',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                               

In [124]:
# Score the linear model on the held-out rows and record the result.
y_pred = linreg.predict(X_test)
linreg_metrics = get_metrics(y_test, y_pred, 'linear_regression')
metrics_list.append(linreg_metrics)

In [125]:
# Show the metrics collected so far, one row per model.
(
    pd.DataFrame(metrics_list)
    .set_index('model')
    .style
    .format(style_format)
)

Unnamed: 0_level_0,mae,mape,r2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
benchmark_median_price,16670,26%,-0%
benchmark_median_price_per_post_code,15182,24%,15%
linear_regression,13638,20%,29%


### Gradient Boosting Regressor

In [126]:
# Gradient-boosted trees on top of the `preprocessor` object that `linreg` also
# uses, so fitting this pipeline refits that shared object.
# random_state pins the estimator's internal randomness to the notebook seed so
# the reported metrics are reproducible across runs.
# NOTE(review): the estimator step keeps its original name 'linear_reg' (a
# leftover from the linear pipeline) so code addressing the step by name keeps
# working; rename it once all usages are known.
boostreg = pipeline.Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('linear_reg', ensemble.GradientBoostingRegressor(random_state=random_seed))
])

In [127]:
# Fit the preprocessing steps and the gradient boosting model on the training split only.
boostreg.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cat_transf',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='unknown',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                               

In [128]:
# Score the gradient boosting model on the held-out rows and record the result.
y_pred = boostreg.predict(X_test)
boostreg_metrics = get_metrics(y_test, y_pred, 'gradient_boosting_regressor')
metrics_list.append(boostreg_metrics)

In [129]:
# Show the metrics collected so far, one row per model.
(
    pd.DataFrame(metrics_list)
    .set_index('model')
    .style
    .format(style_format)
)

Unnamed: 0_level_0,mae,mape,r2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
benchmark_median_price,16670,26%,-0%
benchmark_median_price_per_post_code,15182,24%,15%
linear_regression,13638,20%,29%
gradient_boosting_regressor,11120,16%,39%
