In [13]:
import pandas as pd
import numpy as np
import google.cloud.bigquery as bq
import tensorflow as tf
from tensorflow import keras
from sklearn import metrics
from sklearn import model_selection

### Ingest Data

In [2]:
# Load the IPython extension shipped in google.cloud.bigquery; the query cell
# below relies on the `%%bigquery` cell magic.
%load_ext google.cloud.bigquery

In [3]:
%%bigquery data
-- Fetch every column of each ad, keeping only rows that have both a price per
-- square metre and a post code.
SELECT
  *
FROM `curious-bot.homes.ad_details`
WHERE price_per_sq_m IS NOT NULL
  AND post_code IS NOT NULL

In [4]:
# Use the ad `id` as the row index. The guard keeps this cell re-runnable:
# once `id` has been moved into the index it is no longer a column, and a
# second `set_index('id')` would raise KeyError.
if 'id' in data.columns:
    data = data.set_index('id')

In [5]:
# Quick sanity check of the load: row count, column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7544 entries, 85f3fd3a-b6e6-478c-9daf-2bf67db03572 to 2e3cc25c-5b8f-49f5-a7e1-b4670523346b
Data columns (total 16 columns):
new_building      7544 non-null bool
promoted          7544 non-null bool
price             7544 non-null int64
area              7544 non-null int64
price_per_sq_m    7544 non-null int64
type              7544 non-null object
viewed            7544 non-null datetime64[ns, UTC]
address           7544 non-null object
post_code         7544 non-null object
street_num        7532 non-null object
street_name       7540 non-null object
postal_town       7544 non-null object
country           7544 non-null object
lat               7544 non-null float64
lng               7544 non-null float64
short_desc        7544 non-null object
dtypes: bool(2), datetime64[ns, UTC](1), float64(2), int64(3), object(8)
memory usage: 898.8+ KB


### Define Utils

In [6]:
def demo(dataset, feature_col):
    """Print the dense output of ``feature_col`` for the first dataset element.

    Takes the first element yielded by ``dataset``, uses its item at position 0
    as the input, runs it through a ``DenseFeatures`` layer built from
    ``feature_col`` and prints the result as a NumPy array.

    Args:
        dataset: Iterable whose elements are indexable; item 0 is passed to the
            layer (presumably a ``(features, label)`` pair -- confirm at the
            call site).
        feature_col: Feature column(s) handed to
            ``tf.keras.layers.DenseFeatures``.
    """
    first_element = next(iter(dataset))
    features = first_element[0]
    dense_layer = tf.keras.layers.DenseFeatures(feature_col)
    transformed = dense_layer(features)
    print(transformed.numpy())

def input_to_estimator(X, y, batch_size):
    """Wrap features and target in a batched ``tf.data.Dataset``.

    Args:
        X: Mapping-like feature table; it is converted with ``dict(X)`` so each
            key becomes a named feature.
        y: Target values, sliced in step with ``X``.
        batch_size: Number of consecutive elements grouped into each batch.

    Returns:
        The dataset of ``(features_dict, target)`` slices, batched by
        ``batch_size``. No shuffling or repeating is applied.
    """
    features_by_name = dict(X)
    sliced = tf.data.Dataset.from_tensor_slices((features_by_name, y))
    batched = sliced.batch(batch_size)
    return batched

def mean_absolute_prc_error(y_true, y_pred):
    """Mean absolute percentage error, returned as a fraction (0.1 means 10 %).

    Both inputs are converted to flat float arrays before the comparison so
    that values are paired purely by position:

    * a ``(n, 1)`` column of predictions compared with a ``(n,)`` truth vector
      no longer broadcasts into an ``(n, n)`` matrix, and
    * pandas objects with different indexes no longer align on labels (which
      would yield NaN for every non-matching label).

    Args:
        y_true: Ground-truth values (array-like).
        y_pred: Predicted values (array-like) with the same number of elements
            as ``y_true``, or a scalar.

    Returns:
        ``mean(|y_true - y_pred| / |y_true|)`` as a NumPy float. Zeros in
        ``y_true`` produce ``inf``/``nan`` because of the division.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    relative_error = (actual - predicted) / actual
    return np.mean(np.abs(relative_error))

def get_metrics(y_true, y_pred, model_name):
    """Summarise regression quality for one model as a flat record.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predictions for the same rows as ``y_true``.
        model_name: Label stored under the ``'model'`` key of the result.

    Returns:
        dict with keys ``'model'``, ``'mae'`` (mean absolute error), ``'mape'``
        (from ``mean_absolute_prc_error``) and ``'r2'`` (R^2 score).
    """
    # Score against the `y_true` argument rather than the notebook-level
    # `y_test`, so the function evaluates whatever data the caller passes in.
    # `metrics` is `sklearn.metrics`, imported in the notebook's import cell.
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mape = mean_absolute_prc_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)
    return {'model': model_name, 'mae': mae, 'mape': mape, 'r2': r2}

### Hyperparameters

In [11]:
# Seed passed to the train/test split so the partition is reproducible.
random_seed = 42

# Fraction of rows held out for the test set.
test_size = 0.3

# Number of rows per batch in the tf.data pipelines.
batch_size = 300

# Evenly spaced edges 30, 60, 90, 120, 150 (180 is excluded); presumably bucket
# boundaries for the `area` feature -- confirm where `area_bins` is consumed.
area_bins = [*range(30, 180, 30)]

### Prepare Train And Test Datasets

In [14]:
# Columns used as model inputs; the target is `price_per_sq_m`.
X_cols = ['new_building', 'promoted', 'area', 'type', 'post_code']

# Full (unsplit) feature table and target.
X, y = data[X_cols], data['price_per_sq_m']

# Split whole rows once, then derive features and target from each part so
# they stay aligned row for row.
data_train, data_test = model_selection.train_test_split(
    data,
    test_size=test_size,
    random_state=random_seed,
)

X_train, y_train = data_train[X_cols], data_train['price_per_sq_m']
X_test, y_test = data_test[X_cols], data_test['price_per_sq_m']

# Batched tf.data datasets for the two splits.
ds_train = input_to_estimator(X_train, y_train, batch_size)
ds_test = input_to_estimator(X_test, y_test, batch_size)