In [126]:
import pandas as pd
import numpy as np
import os
from io import StringIO

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score

In [127]:
# sagemaker libraries
import boto3
import sagemaker
import mxnet as mx

from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer

In [128]:
# Read the two transformed CSV files into DataFrames.
active, past = (
    pd.read_csv(csv_name)
    for csv_name in ('transformed_active.csv', 'transformed_past.csv')
)

In [129]:
# Preview the first rows of the `active` table.
active.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,76,77,78,79,80,81,82,83,84,85
0,590000.0,-0.38399,2.188086,0.32575,1.335988,0.003251,-0.204843,1.428275,-1.401699,-0.134462,...,-0.000732,-0.101182,-0.661446,0.482604,-0.735396,0.547869,0.169401,-0.218324,-0.172415,0.231593
1,589000.0,1.369588,0.018237,0.948351,-3.202843,0.500272,1.857549,1.780308,-1.859527,0.61254,...,-0.000137,0.173409,-0.725904,1.569119,0.470643,0.631998,0.609956,-0.262356,-0.034488,-0.233402
2,664900.0,-3.9513,0.375523,-0.471736,-1.379998,0.326533,-1.055838,0.414315,-1.579257,-1.142416,...,-0.000833,-0.152648,-0.796099,0.477727,0.468923,-1.059683,-0.508418,0.011712,-0.571293,1.603781
3,435000.0,1.496055,2.919255,0.142734,2.609945,1.167647,-0.566106,0.914542,-1.527745,-0.710405,...,-0.000571,-0.042684,-0.706107,0.090662,-0.905591,1.083773,0.79915,-0.261384,0.114413,-0.288909
4,400000.0,0.509651,-0.767731,-0.320817,0.47061,-1.145476,-1.0156,-2.42836,-2.771265,-1.351204,...,-0.001295,0.171488,-0.318554,-0.353788,-0.666221,0.332001,-0.734822,0.00322,-0.228598,0.806828


In [130]:
# Preview the first rows of the `past` table.
past.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,76,77,78,79,80,81,82,83,84,85
0,570000.0,-0.818375,3.408697,0.240056,1.993104,-0.653233,-0.160618,-0.290284,0.754372,0.171967,...,-0.001048,-0.017664,-0.154569,0.07767,-0.91063,0.770455,0.173211,-0.117439,-0.122622,0.631282
1,535000.0,-0.862253,3.655905,0.327998,2.054754,-0.439575,-0.048575,-0.441619,0.680315,0.27854,...,-0.000946,-0.015303,-0.010034,-0.266452,0.371394,0.444565,0.313832,-0.077385,0.20981,-0.327195
2,525000.0,-0.856397,3.756488,0.116754,2.16332,-0.27785,-0.204776,-0.271301,0.712407,0.141604,...,-0.000958,-0.001718,-0.04933,-0.079142,-0.232725,0.663266,0.301571,-0.110841,0.084703,0.039595
3,560000.0,-1.436549,3.568422,0.423014,1.428297,-0.991835,0.07392,0.08076,0.785621,0.427419,...,-0.001061,-0.003625,-0.070065,0.168928,-0.327523,0.28634,-0.13564,-0.097357,0.132369,-0.685028
4,560000.0,-0.099081,3.60443,0.071807,2.280483,-0.136271,-0.033714,-0.143441,0.73984,-0.034038,...,-0.000931,0.032677,-0.093392,-0.036924,-0.684056,0.541267,0.165751,-0.104619,0.181542,-0.492672


## Prepare Data
Now that the data has been cleaned, standardized, and reduced in dimensionality, let's prepare it for the model. We need to extract the price feature to use as our target vector (y); it is the first column of each DataFrame.

In [131]:
# Target vectors: the price is the first column of each table.
y_active, y_past = (table.iloc[:, 0] for table in (active, past))

In [132]:
# Sanity check: the target extracted from `active` should hold prices.
y_active.head()

0    590000.0
1    589000.0
2    664900.0
3    435000.0
4    400000.0
Name: 0, dtype: float64

In [133]:
# Number of target values available from `past`.
y_past.shape

(836,)

In [134]:
# Feature matrices: everything except the leading price column.
X_active = active.drop(columns=active.columns[0])
X_past = past.drop(columns=past.columns[0])

In [135]:
# Confirm the price column is gone from the `active` features.
X_active.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,76,77,78,79,80,81,82,83,84,85
0,-0.38399,2.188086,0.32575,1.335988,0.003251,-0.204843,1.428275,-1.401699,-0.134462,-0.471004,...,-0.000732,-0.101182,-0.661446,0.482604,-0.735396,0.547869,0.169401,-0.218324,-0.172415,0.231593
1,1.369588,0.018237,0.948351,-3.202843,0.500272,1.857549,1.780308,-1.859527,0.61254,-0.252534,...,-0.000137,0.173409,-0.725904,1.569119,0.470643,0.631998,0.609956,-0.262356,-0.034488,-0.233402
2,-3.9513,0.375523,-0.471736,-1.379998,0.326533,-1.055838,0.414315,-1.579257,-1.142416,-0.238487,...,-0.000833,-0.152648,-0.796099,0.477727,0.468923,-1.059683,-0.508418,0.011712,-0.571293,1.603781
3,1.496055,2.919255,0.142734,2.609945,1.167647,-0.566106,0.914542,-1.527745,-0.710405,-0.293602,...,-0.000571,-0.042684,-0.706107,0.090662,-0.905591,1.083773,0.79915,-0.261384,0.114413,-0.288909
4,0.509651,-0.767731,-0.320817,0.47061,-1.145476,-1.0156,-2.42836,-2.771265,-1.351204,0.347469,...,-0.001295,0.171488,-0.318554,-0.353788,-0.666221,0.332001,-0.734822,0.00322,-0.228598,0.806828


In [136]:
# Confirm the price column is gone from the `past` features.
X_past.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,76,77,78,79,80,81,82,83,84,85
0,-0.818375,3.408697,0.240056,1.993104,-0.653233,-0.160618,-0.290284,0.754372,0.171967,-0.003675,...,-0.001048,-0.017664,-0.154569,0.07767,-0.91063,0.770455,0.173211,-0.117439,-0.122622,0.631282
1,-0.862253,3.655905,0.327998,2.054754,-0.439575,-0.048575,-0.441619,0.680315,0.27854,-0.005924,...,-0.000946,-0.015303,-0.010034,-0.266452,0.371394,0.444565,0.313832,-0.077385,0.20981,-0.327195
2,-0.856397,3.756488,0.116754,2.16332,-0.27785,-0.204776,-0.271301,0.712407,0.141604,0.007767,...,-0.000958,-0.001718,-0.04933,-0.079142,-0.232725,0.663266,0.301571,-0.110841,0.084703,0.039595
3,-1.436549,3.568422,0.423014,1.428297,-0.991835,0.07392,0.08076,0.785621,0.427419,-0.090199,...,-0.001061,-0.003625,-0.070065,0.168928,-0.327523,0.28634,-0.13564,-0.097357,0.132369,-0.685028
4,-0.099081,3.60443,0.071807,2.280483,-0.136271,-0.033714,-0.143441,0.73984,-0.034038,-0.003269,...,-0.000931,0.032677,-0.093392,-0.036924,-0.684056,0.541267,0.165751,-0.104619,0.181542,-0.492672


In [137]:
# Hold out 20% of past sales as a test set; the remaining 80% is split again
# into training and validation samples in the next split.
# A fixed random_state keeps the partition identical across kernel restarts,
# so results downstream of this cell are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_past, y_past, test_size=0.2, train_size=0.8, random_state=42
)

In [138]:
# Report the dimensions of the train and test partitions.
for caption, part in (
    ("X_train shape: ", X_train),
    ("y_train shape: ", y_train),
    ("X_test shape: ", X_test),
    ("y_test shape: ", y_test),
):
    print(caption, part.shape)

X_train shape:  (668, 85)
y_train shape:  (668,)
X_test shape:  (168, 85)
y_test shape:  (168,)


In [139]:
# Split the non-test rows into training (70%) and validation (30%) samples.
# The pool is rebuilt from X_past / y_past minus the held-out test rows rather
# than from X_train itself: this cell rebinds X_train and y_train, so using them
# as input would shrink the training set further on every re-run.
# A fixed random_state keeps the partition reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_past.drop(index=X_test.index),
    y_past.drop(index=y_test.index),
    test_size=0.3,
    train_size=0.7,
    random_state=42,
)

In [140]:
# Report the dimensions of the train and validation partitions.
for caption, part in (
    ("X_train shape: ", X_train),
    ("y_train shape: ", y_train),
    ("X_val shape: ", X_val),
    ("y_val shape: ", y_val),
):
    print(caption, part.shape)

X_train shape:  (467, 85)
y_train shape:  (467,)
X_val shape:  (201, 85)
y_val shape:  (201,)


## Export Training Data to S3

In [141]:
# Open a SageMaker session and look up the execution role used for API calls.
session = sagemaker.Session()
role = get_execution_role()

# Default S3 bucket associated with the session
bucket = session.default_bucket()

In [142]:
# S3 location (bucket + key prefix) used as the estimator's output path
prefix = 'listings'
output_path = 's3://' + bucket + '/' + prefix

In [115]:
# Export the training set to S3 as CSV for SageMaker's built-in XGBoost.
# For CSV input the algorithm expects no header row and the target variable in
# the first column, so the price labels are prepended to the features and both
# the header and the DataFrame index are left out of the file.
csv_buffer = StringIO()
pd.concat([y_train, X_train], axis=1).to_csv(csv_buffer, header=False, index=False)

s3_resource = boto3.resource('s3')
# The object key stays 'X_train' because the training-job configuration reads it.
s3_resource.Object(bucket, 'X_train').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '2BCB5957AD0C43DE',
  'HostId': 'u5KZ9yGWryPa5mk+9p1mpwow2Qc8u1O4RB4qCIHmcbvTBaMftYjonYGurdg6MhEe5HnRIVTDC6M=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'u5KZ9yGWryPa5mk+9p1mpwow2Qc8u1O4RB4qCIHmcbvTBaMftYjonYGurdg6MhEe5HnRIVTDC6M=',
   'x-amz-request-id': '2BCB5957AD0C43DE',
   'date': 'Thu, 05 Sep 2019 23:32:33 GMT',
   'etag': '"d31d61283834c120593340d17481ccfb"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"d31d61283834c120593340d17481ccfb"'}

In [116]:
# Export the validation set to S3 as CSV for SageMaker's built-in XGBoost.
# A fresh buffer is required: writing into a previously used StringIO appends to
# its existing contents, so the uploaded object would also contain earlier exports.
csv_buffer = StringIO()
# For CSV input the algorithm expects no header row and the target variable in
# the first column, so labels are prepended and header/index are omitted.
pd.concat([y_val, X_val], axis=1).to_csv(csv_buffer, header=False, index=False)

s3_resource = boto3.resource('s3')
# The object key stays 'X_val' because the training-job configuration reads it.
s3_resource.Object(bucket, 'X_val').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': 'D8CC317AAE6301F9',
  'HostId': '52PW1JM0UpzcIyX/CDYAdkZXSTikC5v9n0hJp688mYMdVyYXlEZUZBKaaH/TlH8ZfzLsccbcYZ8=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '52PW1JM0UpzcIyX/CDYAdkZXSTikC5v9n0hJp688mYMdVyYXlEZUZBKaaH/TlH8ZfzLsccbcYZ8=',
   'x-amz-request-id': 'D8CC317AAE6301F9',
   'date': 'Thu, 05 Sep 2019 23:32:33 GMT',
   'etag': '"b12fc839903a84239a83c528fedbe03d"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"b12fc839903a84239a83c528fedbe03d"'}

In [123]:
# Export y_train to S3 (index and value per row, no header).
# A fresh buffer is required: writing into a previously used StringIO appends to
# its existing contents, so the uploaded object would also contain earlier exports.
csv_buffer = StringIO()
y_train.to_csv(csv_buffer, header=False)

s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'y_train').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '12DEAF9195423B08',
  'HostId': 'MJABGjaYb3N0Htwwn2JxLxt12lbdnlOpKSxBev7nW0U6v5YNc/4hRYNDwNRuY4L4LFeVkqHpn0I=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'MJABGjaYb3N0Htwwn2JxLxt12lbdnlOpKSxBev7nW0U6v5YNc/4hRYNDwNRuY4L4LFeVkqHpn0I=',
   'x-amz-request-id': '12DEAF9195423B08',
   'date': 'Thu, 05 Sep 2019 23:39:50 GMT',
   'etag': '"1f9d1a0e25b674af35a002d35e0c2012"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"1f9d1a0e25b674af35a002d35e0c2012"'}

In [124]:
# Export y_val to S3 (index and value per row, no header).
# A fresh buffer is required: writing into a previously used StringIO appends to
# its existing contents, so the uploaded object would also contain earlier exports.
csv_buffer = StringIO()
y_val.to_csv(csv_buffer, header=False)

s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'y_val').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '51FD12398DB94ED5',
  'HostId': 'iWZq/T2PpB8JJmUG44YKIluWocVzzTYecxTqGEWqQEljKDj5E+FgWFpDAfX5ELmndLUw8c5lOiA=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'iWZq/T2PpB8JJmUG44YKIluWocVzzTYecxTqGEWqQEljKDj5E+FgWFpDAfX5ELmndLUw8c5lOiA=',
   'x-amz-request-id': '51FD12398DB94ED5',
   'date': 'Thu, 05 Sep 2019 23:39:51 GMT',
   'etag': '"ac3f8e8eb1dde182b58d53dcf046b9dd"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"ac3f8e8eb1dde182b58d53dcf046b9dd"'}

## Define Model
We will use SageMaker's built-in XGBoost algorithm.

In [117]:
# Resolve the region once from the active boto3 session and use it for the
# XGBoost image URI. `region_name` is also what the SageMaker client for the
# training job is created with, so deriving both from the same value keeps the
# image and the job in one region; a hard-coded region could diverge from the
# region the session (and its default bucket) lives in.
region_name = boto3.Session().region_name
container = get_image_uri(region_name,
                          'xgboost',
                          repo_version='0.90-1')

In [118]:
# Wrap the XGBoost container image in a generic SageMaker Estimator:
# one ml.c4.xlarge training instance, artifacts written under `output_path`.
estimator = sagemaker.estimator.Estimator(
    container,
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c4.xlarge',
    output_path=output_path,
    sagemaker_session=session,
)

In [120]:
%%time
from time import gmtime, strftime

job_name = 'listings-xgboost' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("Training job", job_name)

create_training_params = \
{
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File"
    },
    "RoleArn": role,
    "OutputDataConfig": {
        "S3OutputPath": 's3://' + bucket + "/" + "single-xgboost"
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.m4.4xlarge",
        "VolumeSizeInGB": 5
    },
    "TrainingJobName": job_name,
    "HyperParameters": {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "silent":"0",
        "objective":"reg:linear",
        "num_round":"50"
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 8400
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": 's3://' + bucket + "/" + 'X_train',
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "libsvm",
            "CompressionType": "None"
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": 's3://' + bucket + "/" + 'X_val',
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "libsvm",
            "CompressionType": "None"
        }
    ]
}


client = boto3.client('sagemaker', region_name = region_name)
client.create_training_job(**create_training_params)

Training job listings-xgboost2019-09-05-23-33-06
CPU times: user 16.6 ms, sys: 0 ns, total: 16.6 ms
Wall time: 195 ms


In [125]:
# NOTE(review): disabled alternative to the boto3 create_training_job call above.
# The export cells write S3 keys 'X_train' / 'X_val' (no '.csv' suffix), so these
# URIs would need to be updated before this line is re-enabled.
# estimator.fit({'train':'s3://{}/X_train.csv'.format(bucket), 'validation':'s3://{}/X_val.csv'.format(bucket)})