In [1]:
import pandas as pd
import numpy as np
import os

from io import StringIO
from sklearn.model_selection import train_test_split

In [2]:
# sagemaker libraries
import boto3
import sagemaker
import mxnet as mx

from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer

In [3]:
active = pd.read_csv('transformed_active.csv')
past = pd.read_csv('transformed_past.csv')

In [4]:
active.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,76,77,78,79,80,81,82,83,84,85
0,590000.0,-0.38399,2.188086,0.32575,1.335988,0.003251,-0.204843,1.428275,-1.401699,-0.134462,...,-0.000732,-0.101182,-0.661446,0.482604,-0.735396,0.547869,0.169401,-0.218324,-0.172415,0.231593
1,589000.0,1.369588,0.018237,0.948351,-3.202843,0.500272,1.857549,1.780308,-1.859527,0.61254,...,-0.000137,0.173409,-0.725904,1.569119,0.470643,0.631998,0.609956,-0.262356,-0.034488,-0.233402
2,664900.0,-3.9513,0.375523,-0.471736,-1.379998,0.326533,-1.055838,0.414315,-1.579257,-1.142416,...,-0.000833,-0.152648,-0.796099,0.477727,0.468923,-1.059683,-0.508418,0.011712,-0.571293,1.603781
3,435000.0,1.496055,2.919255,0.142734,2.609945,1.167647,-0.566106,0.914542,-1.527745,-0.710405,...,-0.000571,-0.042684,-0.706107,0.090662,-0.905591,1.083773,0.79915,-0.261384,0.114413,-0.288909
4,400000.0,0.509651,-0.767731,-0.320817,0.47061,-1.145476,-1.0156,-2.42836,-2.771265,-1.351204,...,-0.001295,0.171488,-0.318554,-0.353788,-0.666221,0.332001,-0.734822,0.00322,-0.228598,0.806828


In [5]:
past.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,76,77,78,79,80,81,82,83,84,85
0,570000.0,-0.818375,3.408697,0.240056,1.993104,-0.653233,-0.160618,-0.290284,0.754372,0.171967,...,-0.001048,-0.017664,-0.154569,0.07767,-0.91063,0.770455,0.173211,-0.117439,-0.122622,0.631282
1,535000.0,-0.862253,3.655905,0.327998,2.054754,-0.439575,-0.048575,-0.441619,0.680315,0.27854,...,-0.000946,-0.015303,-0.010034,-0.266452,0.371394,0.444565,0.313832,-0.077385,0.20981,-0.327195
2,525000.0,-0.856397,3.756488,0.116754,2.16332,-0.27785,-0.204776,-0.271301,0.712407,0.141604,...,-0.000958,-0.001718,-0.04933,-0.079142,-0.232725,0.663266,0.301571,-0.110841,0.084703,0.039595
3,560000.0,-1.436549,3.568422,0.423014,1.428297,-0.991835,0.07392,0.08076,0.785621,0.427419,...,-0.001061,-0.003625,-0.070065,0.168928,-0.327523,0.28634,-0.13564,-0.097357,0.132369,-0.685028
4,560000.0,-0.099081,3.60443,0.071807,2.280483,-0.136271,-0.033714,-0.143441,0.73984,-0.034038,...,-0.000931,0.032677,-0.093392,-0.036924,-0.684056,0.541267,0.165751,-0.104619,0.181542,-0.492672


## Prepare Data
Now that we have cleaned, standardized, and completed dimensionality reduction for our data, let's prep it for our model. We need to extract the price feature to use for our target vector (y). It is the first column of each DataFrame.

In [6]:
# Create target vectors
y_active = active.iloc[:,0]
y_past = past.iloc[:,0]

In [7]:
y_active.head()

0    590000.0
1    589000.0
2    664900.0
3    435000.0
4    400000.0
Name: 0, dtype: float64

In [8]:
y_past.shape

(836,)

In [9]:
# drop price column from X features
X_active = active.drop(active.columns[0], axis=1)
X_past = past.drop(past.columns[0], axis=1)

In [10]:
X_active.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,76,77,78,79,80,81,82,83,84,85
0,-0.38399,2.188086,0.32575,1.335988,0.003251,-0.204843,1.428275,-1.401699,-0.134462,-0.471004,...,-0.000732,-0.101182,-0.661446,0.482604,-0.735396,0.547869,0.169401,-0.218324,-0.172415,0.231593
1,1.369588,0.018237,0.948351,-3.202843,0.500272,1.857549,1.780308,-1.859527,0.61254,-0.252534,...,-0.000137,0.173409,-0.725904,1.569119,0.470643,0.631998,0.609956,-0.262356,-0.034488,-0.233402
2,-3.9513,0.375523,-0.471736,-1.379998,0.326533,-1.055838,0.414315,-1.579257,-1.142416,-0.238487,...,-0.000833,-0.152648,-0.796099,0.477727,0.468923,-1.059683,-0.508418,0.011712,-0.571293,1.603781
3,1.496055,2.919255,0.142734,2.609945,1.167647,-0.566106,0.914542,-1.527745,-0.710405,-0.293602,...,-0.000571,-0.042684,-0.706107,0.090662,-0.905591,1.083773,0.79915,-0.261384,0.114413,-0.288909
4,0.509651,-0.767731,-0.320817,0.47061,-1.145476,-1.0156,-2.42836,-2.771265,-1.351204,0.347469,...,-0.001295,0.171488,-0.318554,-0.353788,-0.666221,0.332001,-0.734822,0.00322,-0.228598,0.806828


In [11]:
X_past.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,76,77,78,79,80,81,82,83,84,85
0,-0.818375,3.408697,0.240056,1.993104,-0.653233,-0.160618,-0.290284,0.754372,0.171967,-0.003675,...,-0.001048,-0.017664,-0.154569,0.07767,-0.91063,0.770455,0.173211,-0.117439,-0.122622,0.631282
1,-0.862253,3.655905,0.327998,2.054754,-0.439575,-0.048575,-0.441619,0.680315,0.27854,-0.005924,...,-0.000946,-0.015303,-0.010034,-0.266452,0.371394,0.444565,0.313832,-0.077385,0.20981,-0.327195
2,-0.856397,3.756488,0.116754,2.16332,-0.27785,-0.204776,-0.271301,0.712407,0.141604,0.007767,...,-0.000958,-0.001718,-0.04933,-0.079142,-0.232725,0.663266,0.301571,-0.110841,0.084703,0.039595
3,-1.436549,3.568422,0.423014,1.428297,-0.991835,0.07392,0.08076,0.785621,0.427419,-0.090199,...,-0.001061,-0.003625,-0.070065,0.168928,-0.327523,0.28634,-0.13564,-0.097357,0.132369,-0.685028
4,-0.099081,3.60443,0.071807,2.280483,-0.136271,-0.033714,-0.143441,0.73984,-0.034038,-0.003269,...,-0.000931,0.032677,-0.093392,-0.036924,-0.684056,0.541267,0.165751,-0.104619,0.181542,-0.492672


In [12]:
# Split past sales into training and validation samples
X_train, X_test, y_train, y_test = train_test_split(X_past, y_past, test_size=0.2, train_size=0.8)

In [13]:
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)

print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)

X_train shape:  (668, 85)
y_train shape:  (668,)
X_test shape:  (168, 85)
y_test shape:  (168,)


In [14]:
# Split training data into training and validation samples
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, train_size=0.7)

In [15]:
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)

print("X_val shape: ", X_val.shape)
print("y_val shape: ", y_val.shape)

X_train shape:  (467, 85)
y_train shape:  (467,)
X_val shape:  (201, 85)
y_val shape:  (201,)


## Export Training Data to S3

In [16]:
# SageMaker session and role
session = sagemaker.Session()
role = sagemaker.get_execution_role()

# S3 bucket name
bucket = session.default_bucket()

In [17]:
# specify an output path
prefix = 'listings'
output_path = 's3://{}/{}'.format(bucket, prefix)

In [18]:
data_dir = '../data/listings'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

In [19]:
# Store testing samples in S3 for later use
X_test.to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)

In [20]:
# Store training samples in S3, making sure to concatenate y label as first column
pd.concat([y_train, X_train], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

In [21]:
# Store validation samples in S3, making sure to concatenate y label as first column
pd.concat([y_val, X_val], axis=1).to_csv(os.path.join(data_dir, 'validation.csv'), header=False, index=False)

In [22]:
# Upload to S3
prefix = 'listings'

train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)
val_location = session.upload_data(os.path.join(data_dir, 'validation.csv'), key_prefix=prefix)
test_location = session.upload_data(os.path.join(data_dir, 'test.csv'), key_prefix=prefix)

## Define Model
We will use SageMaker's built in XGBoost model.

In [23]:
# Set region, container, and output_path
region_name = 'us-east-2'

container = get_image_uri(boto3.Session().region_name,
                          'xgboost', 
                          repo_version = '0.90-1'); 

output_path = 's3://{}/{}/output'.format(session.default_bucket(), prefix)

In [24]:
# Construct estimator object
xgb = sagemaker.estimator.Estimator(container,
                                      role = role, 
                                      train_instance_count = 1, 
                                      train_instance_type = 'ml.c4.xlarge', 
                                      output_path = output_path, 
                                      sagemaker_session = session);

## Hyperparameter Tuning
Let's see if we can improve the Mean Absolute Percentage Error (rmse) for the validation samples by changing some of the hyperparameters our model is using for training. Each cell below will be used to alter a single hyperparameter.

In [33]:
# Define hyperparameters for this model
# Successful values that I have used previously will be our baseline
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        objective='reg:linear',
                        early_stopping_rounds=10,
                        rate_drop=0.3,
                        num_round=200)

In [34]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

xgb_hyperparameter_tuner = HyperparameterTuner( estimator = xgb,
                                                objective_metric_name = 'validation:mae',
                                                objective_type = 'Minimize',
                                                max_jobs = 20, 
                                                max_parallel_jobs = 3, 
                                                hyperparameter_ranges = {
                                                  'eta'      : ContinuousParameter(0.0, 0.5),
                                                  'lambda'   : ContinuousParameter(0, 1000),
                                                  'max_depth': IntegerParameter(5, 17),
                                                  'num_round': IntegerParameter(100, 500),
                                                  'min_child_weight': IntegerParameter(1, 10),
                                                })

## Train the Model
We will use the built-in "fit" function with the model we defined above, and access the training and validation data we have stored in S3.

In [35]:
# This is a wrapper around the location of our train and validation data
# This makes sure SageMaker knows our data is in csv format
s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=val_location, content_type='csv')

xgb_hyperparameter_tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})

In [36]:
xgb_hyperparameter_tuner.wait()

.....................................................................................................................................................................................................................................................!


In [37]:
xgb_attached = sagemaker.estimator.Estimator.attach(xgb_hyperparameter_tuner.best_training_job())

2019-09-06 20:29:26 Starting - Preparing the instances for training
2019-09-06 20:29:26 Downloading - Downloading input data
2019-09-06 20:29:26 Training - Training image download completed. Training in progress.
2019-09-06 20:29:26 Uploading - Uploading generated training model
2019-09-06 20:29:26 Completed - Training job completed[31m2019-09-06 20:29:16,089 sagemaker-containers INFO     Imported framework sagemaker_xgboost_container.training[0m
[31m2019-09-06 20:29:16,090 sagemaker-containers INFO     Failed to parse hyperparameter objective value reg:linear to Json.[0m
[31mReturning the value itself[0m
[31m2019-09-06 20:29:16,090 sagemaker-containers INFO     Failed to parse hyperparameter _tuning_objective_metric value validation:mae to Json.[0m
[31mReturning the value itself[0m
[31m2019-09-06 20:29:16,094 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-09-06 20:29:16,110 sagemaker_xgboost_container.training INFO     Running XGB

The response for our best training round:
{'_tuning_objective_metric': 'validation:mae', 'max_depth': 11, 'subsample': 0.8, 'num_round': 421, 'gamma': 4.0, 'eval_metric': ['mae'], 'early_stopping_rounds': 10, 'rate_drop': 0.3, 'lambda': 474.24314972169725, 'objective': 'reg:linear', 'eta': 0.373620175273373, 'min_child_weight': 10.0}

This shows us that the best Mean Absolute Error (MAE) we were able to achieve during training was 35162, which converted to Mean Absolute Percentage Error (MAPE) can be found by dividing this MAE value by the mean sale price of this validation sample set.

In [39]:
y_val.mean()

506853.7960199005

In [43]:
top_mae = 35162.7
mape = top_mae / y_val.mean()
print(mape)

0.06937444343145338


That's under 7% but let's try changing the drop rate to see if we can achieve an even lower MAPE:

In [68]:
# Define hyperparameters for this model
# rate_drop has been updated to 0.5 from 0.3
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        objective='reg:linear',
                        early_stopping_rounds=10,
                        rate_drop=0.5,
                        num_round=200)

In [69]:
# The same tuning parameters are being used,
# but we are re-assigning the xgb reference to 'estimator' now that it has been updated:
xgb_hyperparameter_tuner = HyperparameterTuner( estimator = xgb,
                                                objective_metric_name = 'validation:mae',
                                                objective_type = 'Minimize',
                                                max_jobs = 20, 
                                                max_parallel_jobs = 3, 
                                                hyperparameter_ranges = {
                                                  'eta'      : ContinuousParameter(0.0, 0.5),
                                                  'lambda'   : ContinuousParameter(0, 1000),
                                                  'max_depth': IntegerParameter(5, 17),
                                                  'num_round': IntegerParameter(100, 500),
                                                  'min_child_weight': IntegerParameter(1, 10),
                                                })

In [46]:
xgb_hyperparameter_tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})

In [47]:
xgb_hyperparameter_tuner.wait()

....................................................................................................................................................................................................................................!


In [48]:
xgb_attached2 = sagemaker.estimator.Estimator.attach(xgb_hyperparameter_tuner.best_training_job())

2019-09-06 21:43:58 Starting - Preparing the instances for training
2019-09-06 21:43:58 Downloading - Downloading input data
2019-09-06 21:43:58 Training - Training image download completed. Training in progress.
2019-09-06 21:43:58 Uploading - Uploading generated training model
2019-09-06 21:43:58 Completed - Training job completed[31m2019-09-06 21:43:47,197 sagemaker-containers INFO     Imported framework sagemaker_xgboost_container.training[0m
[31m2019-09-06 21:43:47,198 sagemaker-containers INFO     Failed to parse hyperparameter objective value reg:linear to Json.[0m
[31mReturning the value itself[0m
[31m2019-09-06 21:43:47,198 sagemaker-containers INFO     Failed to parse hyperparameter _tuning_objective_metric value validation:mae to Json.[0m
[31mReturning the value itself[0m
[31m2019-09-06 21:43:47,202 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-09-06 21:43:47,214 sagemaker_xgboost_container.training INFO     Running XGB

In [53]:
top_mae = 33669.5
mape = top_mae / y_val.mean()
print(mape)

0.06642842623334727


This drop out rate performed a bit better overall, let's see what values were selected for the other parameters as this job's "best" round of training:
{'subsample': 0.8, 'early_stopping_rounds': 10, 'eval_metric': ['mae'], 'eta': 0.4539415243497953, 'gamma': 4.0, 'max_depth': 14, '_tuning_objective_metric': 'validation:mae', 'objective': 'reg:linear', 'min_child_weight': 10.0, 'lambda': 115.80246104977371, 'rate_drop': 0.5, 'num_round': 363}

In [49]:
# Define hyperparameters for this model
# rate_drop has been updated to 0.2 from 0.5
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        objective='reg:linear',
                        early_stopping_rounds=10,
                        rate_drop=0.2,
                        num_round=200)

In [50]:
# The same tuning parameters are being used,
# but we are re-assigning the xgb reference to 'estimator' now that it has been updated:
xgb_hyperparameter_tuner = HyperparameterTuner( estimator = xgb,
                                                objective_metric_name = 'validation:mae',
                                                objective_type = 'Minimize',
                                                max_jobs = 20, 
                                                max_parallel_jobs = 3, 
                                                hyperparameter_ranges = {
                                                  'eta'      : ContinuousParameter(0.0, 0.5),
                                                  'lambda'   : ContinuousParameter(0, 1000),
                                                  'max_depth': IntegerParameter(5, 17),
                                                  'num_round': IntegerParameter(100, 500),
                                                  'min_child_weight': IntegerParameter(1, 10),
                                                })

In [51]:
xgb_hyperparameter_tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})

In [52]:
xgb_hyperparameter_tuner.wait()

......................................................................................................................................................................................................................................................!


In [54]:
xgb_attached3 = sagemaker.estimator.Estimator.attach(xgb_hyperparameter_tuner.best_training_job())

2019-09-06 22:06:45 Starting - Preparing the instances for training
2019-09-06 22:06:45 Downloading - Downloading input data
2019-09-06 22:06:45 Training - Training image download completed. Training in progress.
2019-09-06 22:06:45 Uploading - Uploading generated training model
2019-09-06 22:06:45 Completed - Training job completed[31m2019-09-06 22:06:34,840 sagemaker-containers INFO     Imported framework sagemaker_xgboost_container.training[0m
[31m2019-09-06 22:06:34,841 sagemaker-containers INFO     Failed to parse hyperparameter _tuning_objective_metric value validation:mae to Json.[0m
[31mReturning the value itself[0m
[31m2019-09-06 22:06:34,841 sagemaker-containers INFO     Failed to parse hyperparameter objective value reg:linear to Json.[0m
[31mReturning the value itself[0m
[31m2019-09-06 22:06:34,844 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-09-06 22:06:34,856 sagemaker_xgboost_container.training INFO     Running XGB

In [55]:
top_mae = 34834.2
mape = top_mae / y_val.mean()
print(mape)

0.06872632753969216


The parameters for this job's best round were: {'eval_metric': ['mae'], 'num_round': 437, '_tuning_objective_metric': 'validation:mae', 'early_stopping_rounds': 10, 'subsample': 0.8, 'eta': 0.43166340798965475, 'lambda': 682.9055918627323, 'gamma': 4.0, 'max_depth': 5, 'rate_drop': 0.2, 'objective': 'reg:linear', 'min_child_weight': 8.0}

After inspecting the parameters for the "best" training rounds with dropout rates of 0.2, 0.3, and 0.5, I am interested in increasing the early stopping rounds and checking the MAPE for a dropout rate of 0.4. While we are trying to minimize MAPE for this validation set, our ultimate goal is best generalize these findings, in order to minimize MAPE for new samples to have as accurate of a model as possible.

In [56]:
# Define hyperparameters for this model
# rate_drop has been updated to 0.4 from 0.2
# early stopping rounds have also been increased to 30
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        objective='reg:linear',
                        early_stopping_rounds=30,
                        rate_drop=0.4,
                        num_round=200)

In [57]:
# The same tuning parameters are being used,
# but we are re-assigning the xgb reference to 'estimator' now that it has been updated:
xgb_hyperparameter_tuner = HyperparameterTuner( estimator = xgb,
                                                objective_metric_name = 'validation:mae',
                                                objective_type = 'Minimize',
                                                max_jobs = 20, 
                                                max_parallel_jobs = 3, 
                                                hyperparameter_ranges = {
                                                  'eta'      : ContinuousParameter(0.0, 0.5),
                                                  'lambda'   : ContinuousParameter(0, 1000),
                                                  'max_depth': IntegerParameter(5, 17),
                                                  'num_round': IntegerParameter(100, 500),
                                                  'min_child_weight': IntegerParameter(1, 10),
                                                })

In [58]:
xgb_hyperparameter_tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})

In [59]:
xgb_hyperparameter_tuner.wait()

................................................................................................................................................................................................................................!


In [60]:
xgb_attached4 = sagemaker.estimator.Estimator.attach(xgb_hyperparameter_tuner.best_training_job())

2019-09-06 22:28:53 Starting - Preparing the instances for training
2019-09-06 22:28:53 Downloading - Downloading input data
2019-09-06 22:28:53 Training - Training image download completed. Training in progress.
2019-09-06 22:28:53 Uploading - Uploading generated training model
2019-09-06 22:28:53 Completed - Training job completed[31m2019-09-06 22:28:43,213 sagemaker-containers INFO     Imported framework sagemaker_xgboost_container.training[0m
[31m2019-09-06 22:28:43,214 sagemaker-containers INFO     Failed to parse hyperparameter objective value reg:linear to Json.[0m
[31mReturning the value itself[0m
[31m2019-09-06 22:28:43,214 sagemaker-containers INFO     Failed to parse hyperparameter _tuning_objective_metric value validation:mae to Json.[0m
[31mReturning the value itself[0m
[31m2019-09-06 22:28:43,217 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-09-06 22:28:43,229 sagemaker_xgboost_container.training INFO     Running XGB

In [61]:
top_mae = 35049.7
mape = top_mae / y_val.mean()
print(mape)

0.06915149945650964


Alright, it is looking like 0.5 is our ideal drop rate, but let's try 0.6 just to be sure.

In [62]:
# Define hyperparameters for this model
# rate_drop has been updated to 0.6 from 0.4
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        objective='reg:linear',
                        early_stopping_rounds=30,
                        rate_drop=0.6,
                        num_round=200)

In [63]:
# The same tuning parameters are being used,
# but we are re-assigning the xgb reference to 'estimator' now that it has been updated:
xgb_hyperparameter_tuner = HyperparameterTuner( estimator = xgb,
                                                objective_metric_name = 'validation:mae',
                                                objective_type = 'Minimize',
                                                max_jobs = 20, 
                                                max_parallel_jobs = 3, 
                                                hyperparameter_ranges = {
                                                  'eta'      : ContinuousParameter(0.0, 0.5),
                                                  'lambda'   : ContinuousParameter(0, 1000),
                                                  'max_depth': IntegerParameter(5, 17),
                                                  'num_round': IntegerParameter(100, 500),
                                                  'min_child_weight': IntegerParameter(1, 10),
                                                })

In [64]:
xgb_hyperparameter_tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})

In [65]:
xgb_hyperparameter_tuner.wait()

...........................................................................................................................................................................................................................................!


In [66]:
xgb_attached5 = sagemaker.estimator.Estimator.attach(xgb_hyperparameter_tuner.best_training_job())

2019-09-06 22:54:52 Starting - Preparing the instances for training
2019-09-06 22:54:52 Downloading - Downloading input data
2019-09-06 22:54:52 Training - Training image download completed. Training in progress.
2019-09-06 22:54:52 Uploading - Uploading generated training model
2019-09-06 22:54:52 Completed - Training job completed[31m2019-09-06 22:54:23,730 sagemaker-containers INFO     Imported framework sagemaker_xgboost_container.training[0m
[31m2019-09-06 22:54:23,731 sagemaker-containers INFO     Failed to parse hyperparameter _tuning_objective_metric value validation:mae to Json.[0m
[31mReturning the value itself[0m
[31m2019-09-06 22:54:23,731 sagemaker-containers INFO     Failed to parse hyperparameter objective value reg:linear to Json.[0m
[31mReturning the value itself[0m
[31m2019-09-06 22:54:23,734 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-09-06 22:54:23,746 sagemaker_xgboost_container.training INFO     Running XGB

In [67]:
top_mae = 35078.9
mape = top_mae / y_val.mean()
print(mape)

0.06920910975799954


Great! It looks like we have found our optimal hyperparameters with a dropout rate of 0.5:
{'subsample': 0.8, 'early_stopping_rounds': 10, 'eval_metric': ['mae'], 'eta': 0.4539415243497953, 'gamma': 4.0, 'max_depth': 14, '_tuning_objective_metric': 'validation:mae', 'objective': 'reg:linear', 'min_child_weight': 10.0, 'lambda': 115.80246104977371, 'rate_drop': 0.5, 'num_round': 363}

In [84]:
# Define hyperparameters for this model
# rate_drop has been updated to 0.5 from 0.6
xgb.set_hyperparameters(max_depth=14,
                        eta=0.45394,
                        gamma=4,
                        min_child_weight=10,
                        subsample=0.8,
                        early_stopping_rounds=30,
                        rate_drop=0.5,
                        num_round=400)

In [85]:
# The same tuning parameters are being used,
# but we are re-assigning the xgb reference to 'estimator' now that it has been updated:
xgb_hyperparameter_tuner = HyperparameterTuner( estimator = xgb,
                                                objective_metric_name = 'validation:mae',
                                                objective_type = 'Minimize',
                                                max_jobs = 10, 
                                                max_parallel_jobs = 3, 
                                                hyperparameter_ranges = {
                                                  'lambda'   : ContinuousParameter(100, 200),
                                                })

In [86]:
xgb_hyperparameter_tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})

In [87]:
xgb_hyperparameter_tuner.wait()

...............................................................................................................................!


In [88]:
optimized_xgb = sagemaker.estimator.Estimator.attach(xgb_hyperparameter_tuner.best_training_job())

2019-09-06 23:58:20 Starting - Preparing the instances for training
2019-09-06 23:58:20 Downloading - Downloading input data
2019-09-06 23:58:20 Training - Training image download completed. Training in progress.
2019-09-06 23:58:20 Uploading - Uploading generated training model
2019-09-06 23:58:20 Completed - Training job completed[31m2019-09-06 23:58:10,417 sagemaker-containers INFO     Imported framework sagemaker_xgboost_container.training[0m
[31m2019-09-06 23:58:10,417 sagemaker-containers INFO     Failed to parse hyperparameter objective value reg:linear to Json.[0m
[31mReturning the value itself[0m
[31m2019-09-06 23:58:10,418 sagemaker-containers INFO     Failed to parse hyperparameter _tuning_objective_metric value validation:mae to Json.[0m
[31mReturning the value itself[0m
[31m2019-09-06 23:58:10,421 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-09-06 23:58:10,433 sagemaker_xgboost_container.training INFO     Running XGB

Training seconds: 46
Billable seconds: 46


In [89]:
top_mae = 33775.5
mape = top_mae / y_val.mean()
print(mape)

0.06663755951957767


Excellent. Now that we have found our optimal hyperparameters and trained our model, we are ready to use it to predict the sale price of our active listings.

## Test the Model
We will use SageMaker's Batch Transform functionality to test our model, so first we will define a transformer object using our optimized model from above.

In [90]:
xgb_transformer = optimized_xgb.transformer(instance_count=1, instance_type='ml.m4.xlarge')

In [91]:
# Earlier we saved X_test to S3 and stored that path as 'test_location'
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')
xgb_transformer.wait()

....................................!


In [92]:
# Output is saved to S3, so we will need to load it into this notebook from S3
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

# Predicted values for the test set will be saved to Pandas Series object 'y_pred'
y_pred = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)

Completed 1.9 KiB/1.9 KiB (29.0 KiB/s) with 1 file(s) remainingdownload: s3://sagemaker-us-east-2-359641297910/sagemaker-xgboost-190906-2349-009-77266-2019-09-07-00-01-14-086/test.csv.out to ../data/listings/test.csv.out


In [97]:
# def mape(y_true, y_pred):
#     y_true, y_pred = np.array(y_true), np.array(y_pred)
#     return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [101]:
# Calculate MAPE of model's predictions of unseen samples
test_mae = mean_absolute_error(y_test, y_pred)
print(test_mae)

32400.27176339286


This value seems like a pretty good fit since \\$32,400 is less than 6 percent of the median sale price of \\$546,300 in the region. So now that we have tested our model, and the MAE score is satisfactory, we can use this model to predict the actual potential sales price of active listings.

## Use the Model for Predicting Value of Active Listings