In [217]:
# Import all the required libraries
# Import sagemaker and Boto3 to connect to aws.

import numpy as np
import pandas as pd

# Define IAM role
import boto3
import re
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
# SageMaker SDK documentation: https://sagemaker.readthedocs.io/en/stable/estimators.html

In [218]:
# Upload data to S3

In [219]:
# Destination bucket, followed by the object key of each dataset inside it.
bucket_name = "sara-ml-sagemaker"
training_file_key = "bikeshare/bikesharing_train.csv"      # training set
validation_file_key = "bikeshare/bikesharing_valida.csv"   # validation set
test_file_key = "bikeshare/bike_test.csv"                  # test set

In [220]:
# Build the full S3 URIs from the bucket name and the object keys.
# The named placeholders {bucket} and {key} are filled in from the keyword
# arguments given to str.format; positional placeholders such as {0} and {1}
# (filled from positional arguments, in order) would work equally well.
s3_model_output_location = 's3://{bucket}/bikeshare/model'.format(bucket=bucket_name)
s3_training_file_location = 's3://{bucket}/{key}'.format(bucket=bucket_name, key=training_file_key)
s3_validation_file_location = 's3://{bucket}/{key}'.format(bucket=bucket_name, key=validation_file_key)
s3_test_file_location = 's3://{bucket}/{key}'.format(bucket=bucket_name, key=test_file_key)

In [221]:
# Echo the resolved S3 locations so they can be checked by eye.
s3_model_output_location, s3_training_file_location, s3_validation_file_location, s3_test_file_location

('s3://sara-ml-sagemaker/bikeshare/model',
 's3://sara-ml-sagemaker/bikeshare/bikesharing_train.csv',
 's3://sara-ml-sagemaker/bikeshare/bikesharing_valida.csv',
 's3://sara-ml-sagemaker/bikeshare/bike_test.csv')

In [222]:
# Writing files to s3
# Files are referred to as objects in S3
# File name is referred to as key in s3
# Files stored in s3 are automatically replicated across 3 availability zones. 

#https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html

def write_to_s3(filename, bucket, key):
    """Upload a local file to S3 as a single object.

    Args:
        filename: Path of the local file to upload.
        bucket: Name of the destination S3 bucket.
        key: Object key (the S3 "file name") to store the data under.

    Returns:
        Whatever ``Object.upload_fileobj`` returns.
    """
    # Open in binary mode so the bytes are uploaded unchanged.
    with open(filename, 'rb') as f:
        # Use the `bucket` argument rather than the notebook-level `bucket_name`
        # global, so the caller actually controls the destination bucket.
        return boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_fileobj(f)

In [223]:
# All the data in the csv files should be without any headers.
# This transformation has been performed in the biketrain_data_prep3 notebook.

# Upload each prepared CSV (train, validation, test) to its object key in the bucket.
for local_file, object_key in (
    ('bikesharing_train.csv', training_file_key),
    ('bikesharing_valida.csv', validation_file_key),
    ('bike_test.csv', test_file_key),
):
    write_to_s3(local_file, bucket_name, object_key)

Training Algorithm Docker Image

AWS maintains a separate registry for every region and algorithm.

In [224]:
# Registry path for algorithms provided by sagemaker. 
# https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html

# Look up the registry path of the XGBoost algorithm image for the region of
# the current boto3 session.
# repo_version lets you specify the version of the image you want to use;
# specify 'latest' to use the newest version.
# If repo_version is not specified then the version will default to 1.

container = get_image_uri(
    boto3.Session().region_name,
    'xgboost',
    repo_version="latest",
)

In [225]:
# Show the resolved container image path.
container

'811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest'

In [226]:
# Retrieve the execution role; it is passed to the Estimator further down.
role = get_execution_role()

In [227]:
# The role contains the permissions to train and deploy the model.
# The SageMaker service will be assuming this role.
role

'arn:aws:iam::252379044400:role/service-role/AmazonSageMaker-ExecutionRole-20190219T163658'

Build Model

In [228]:
# Configure the training job.
# To train, we first need to establish a SageMaker session; it is handed to
# the Estimator through its sagemaker_session argument.

session = sagemaker.Session()

In [229]:
# Configure the estimator:
#   - which algorithm container image to run,
#   - how many instances to use for distributed training and what type of machine to use,
#   - where the trained model artifacts need to be stored,
#   - optionally, a name for the training job (base_job_name).
#
# https://sagemaker.readthedocs.io/en/stable/estimators.html

estimator = sagemaker.estimator.Estimator(
    container,  # container path retrieved for the region where the code is running
    role,  # role that the model can assume to access the data files
    train_instance_count=1,  # training is carried out on a single compute instance
    train_instance_type='ml.m4.xlarge',  # instance type the training runs on
    output_path=s3_model_output_location,  # the model is saved to this output location
    sagemaker_session=session,  # the SageMaker session details
    base_job_name='xgboost-bikeshare-v1',  # name given to the training job
)

In [230]:
# Hyperparameters for the training job; see the XGBoost parameter reference:
# https://xgboost.readthedocs.io/en/latest/parameter.html
#
# NOTE(review): newer XGBoost releases name this objective "reg:squarederror";
# confirm which XGBoost version the container runs before renaming it.

estimator.set_hyperparameters(
    max_depth=5,
    objective="reg:linear",
    eta=0.1,
    subsample=0.7,
    num_round=150,
)

In [231]:
# Display the hyperparameters currently set on the estimator.
estimator.hyperparameters()

{'max_depth': 5,
 'objective': 'reg:linear',
 'eta': 0.1,
 'subsample': 0.7,
 'num_round': 150}

Specify training data location and validation data location 

In [232]:
# Training and validation file locations need to be specified using the
# s3_input config class; both files are declared as CSV content.

training_input_config = sagemaker.session.s3_input(
    s3_data=s3_training_file_location,
    content_type="csv",
)
validation_input_config = sagemaker.session.s3_input(
    s3_data=s3_validation_file_location,
    content_type="csv",
)

In [233]:
# Inspect the configuration generated for each input.
training_input_config.config, validation_input_config.config

({'DataSource': {'S3DataSource': {'S3DataDistributionType': 'FullyReplicated',
    'S3DataType': 'S3Prefix',
    'S3Uri': 's3://sara-ml-sagemaker/bikeshare/bikesharing_train.csv'}},
  'ContentType': 'csv'},
 {'DataSource': {'S3DataSource': {'S3DataDistributionType': 'FullyReplicated',
    'S3DataType': 'S3Prefix',
    'S3Uri': 's3://sara-ml-sagemaker/bikeshare/bikesharing_valida.csv'}},
  'ContentType': 'csv'})

Train the Model 

In [234]:
# Train the model.
# Each dictionary key names an input channel; the training data goes in the
# "train" channel and the validation data in the "validation" channel.
# https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html

estimator.fit({
    'train': training_input_config,
    'validation': validation_input_config,
})

2019-06-13 02:02:30 Starting - Starting the training job.........
2019-06-13 02:03:31 Starting - Launching requested ML instances......
2019-06-13 02:04:39 Starting - Preparing the instances for training......
2019-06-13 02:05:50 Downloading - Downloading input data...
2019-06-13 02:06:29 Training - Training image download completed. Training in progress..
[31mArguments: train[0m
[31m[2019-06-13:02:06:30:INFO] Running standalone xgboost training.[0m
[31m[2019-06-13:02:06:30:INFO] File size need to be processed in the node: 0.63mb. Available memory size in the node: 8421.62mb[0m
[31m[2019-06-13:02:06:30:INFO] Determined delimiter of CSV input is ','[0m
[31m[02:06:30] S3DistributionType set as FullyReplicated[0m
[31m[02:06:30] 7620x13 matrix with 99060 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[2019-06-13:02:06:30:INFO] Determined delimiter of CSV input is ','[0m
[31m[02:06:30] S3DistributionType set as FullyReplicated[0m
[


2019-06-13 02:06:43 Uploading - Uploading generated training model
2019-06-13 02:06:43 Completed - Training job completed
Billable seconds: 53


Deploy the Model

In [238]:
# https://sagemaker.readthedocs.io/en/stable/estimators.html
# Add update_endpoint=True after the first round, so that the endpoint that
# was already created is updated with the changes.

predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    endpoint_name='xgboost-bikeshare-v2',
)

----------------------------------------------------------------------------------------------------!

Run Predictions

In [239]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Requests: serialize the payload with csv_serializer and label it as text/csv.
predictor.serializer = csv_serializer
predictor.content_type = 'text/csv'

# Responses: no deserializer is installed, so predict() hands back the
# response as returned by the endpoint.
predictor.deserializer = None

In [240]:
# Request a prediction for a single sample given as a list of 13 values.
# NOTE(review): presumably these follow the feature-column order of the
# training CSV, without the label column; confirm against the data prep notebook.
predictor.predict([1,0,1,1,10.66,12.88,56,11.0014,2011,1,20,3,3])

b'1.15976893902'