In [5]:
import numpy as np
import pandas as pd

import boto3
import re

import sagemaker
from sagemaker import get_execution_role

## Upload Data to S3

In [1]:
# S3 bucket that holds the bike-rental data sets and the trained model.
bucket_name = 'ruchi-ml-sagemaker'

# Key prefixes (relative to the bucket) for each data split.
training_folder = 'bikerental/training/'
validation_folder = 'bikerental/validation/'
test_folder = 'bikerental/test/'

# Fully qualified S3 URIs built from the bucket name and the prefixes above.
s3_model_output_location = 's3://' + bucket_name + '/bikerental/model'
s3_training_file_location = 's3://' + bucket_name + '/' + training_folder
s3_validation_file_location = 's3://' + bucket_name + '/' + validation_folder
s3_test_file_location = 's3://' + bucket_name + '/' + test_folder

In [2]:
# Echo the four S3 URIs, one per line, as a quick sanity check.
print(
    s3_model_output_location,
    s3_training_file_location,
    s3_validation_file_location,
    s3_test_file_location,
    sep='\n',
)

s3://ruchi-ml-sagemaker/bikerental/model
s3://ruchi-ml-sagemaker/bikerental/training/
s3://ruchi-ml-sagemaker/bikerental/validation/
s3://ruchi-ml-sagemaker/bikerental/test/


In [3]:
# Helper for writing a local file to S3
def write_to_s3(filename, bucket, key):
    """Upload a local file to an S3 object.

    Args:
        filename: Path of the local file to upload.
        bucket: Name of the destination S3 bucket.
        key: Object key (path inside the bucket) to write to.

    Returns:
        Whatever ``upload_fileobj`` returns for the target object.
    """
    # Binary mode so the bytes are sent exactly as stored on disk.
    with open(filename, 'rb') as local_file:
        s3 = boto3.Session().resource('s3')
        target = s3.Bucket(bucket).Object(key)
        return target.upload_fileobj(local_file)

In [6]:
# Upload each local CSV split to its S3 prefix; the object key is the
# prefix followed by the local file name.
for local_csv, s3_prefix in (
    ('bike_train.csv', training_folder),
    ('bike_validation.csv', validation_folder),
    ('bike_test.csv', test_folder),
):
    write_to_s3(local_csv, bucket_name, s3_prefix + local_csv)

In [7]:
# Establish a SageMaker session with AWS; later cells read the region from
# it (sess.boto_region_name) and pass it to the estimator.
sess = sagemaker.Session()

In [8]:
# Execution role used later when constructing the estimator.
role = get_execution_role()

In [9]:
# Show the resolved role ARN.
# NOTE(review): the ARN contains the AWS account ID; consider clearing this
# cell's output before sharing the notebook.
print(role)

arn:aws:iam::717750742956:role/service-role/AmazonSageMaker-ExecutionRole-20200503T152781


In [10]:
# Resolve the XGBoost training image URI for the session's region.
# NOTE(review): the recorded output of this cell includes an SDK hint that
# mentions get_image_uri(region, 'xgboost', '0.90-1'); the 'latest' tag is
# kept here so the image matches the recorded run. Confirm before pinning.
container = sagemaker.amazon.amazon_estimator.get_image_uri(
    sess.boto_region_name, "xgboost", "latest"
)

print(
    'Using SageMaker XGBoost container:\n{} ({})'.format(
        container, sess.boto_region_name
    )
)

	get_image_uri(region, 'xgboost', '0.90-1').


Using SageMaker XGBoost container:
811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest (us-east-1)


## Build Model

In [11]:
# Configure the training job:
#   - the training image and execution role resolved in earlier cells,
#   - the number and type of instances to train on,
#   - the S3 location where the final model artifacts need to be stored,
#   - the base name used for the training job.
estimator = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name='xgboost-bikerental-v1',
)

In [12]:
# Specify hyperparameters that are appropriate for the training algorithm.
# Values in use: max_depth=5, objective=reg:linear, eta=0.1, num_round=150.
# NOTE(review): an earlier version of this comment also listed
# subsample=0.7, which is not passed below; confirm whether it should be.
estimator.set_hyperparameters(
    max_depth=5,
    objective="reg:linear",
    eta=0.1,
    num_round=150,
)

In [13]:
# Display the hyperparameters currently set on the estimator.
estimator.hyperparameters()

{'max_depth': 5, 'objective': 'reg:linear', 'eta': 0.1, 'num_round': 150}

### Specify Training Data Location and, Optionally, Validation Data Location

In [14]:
# Build one CSV / S3Prefix input configuration per data split. Both use the
# same settings and differ only in the S3 location they point at.
training_input_config, validation_input_config = (
    sagemaker.session.s3_input(
        s3_data=location,
        content_type='csv',
        s3_data_type='S3Prefix',
    )
    for location in (s3_training_file_location, s3_validation_file_location)
)

# Channel name -> input configuration, as passed to estimator.fit().
data_channels = dict(
    train=training_input_config,
    validation=validation_input_config,
)

In [15]:
# Show the request configuration generated for each channel, one per line.
print(
    training_input_config.config,
    validation_input_config.config,
    sep='\n',
)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://ruchi-ml-sagemaker/bikerental/training/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://ruchi-ml-sagemaker/bikerental/validation/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}


### Train the model

In [16]:
# Start the training job using the 'train' and 'validation' channels.
estimator.fit(data_channels)

2020-05-04 20:34:57 Starting - Starting the training job...
2020-05-04 20:34:59 Starting - Launching requested ML instances......
2020-05-04 20:36:05 Starting - Preparing the instances for training......
2020-05-04 20:37:14 Downloading - Downloading input data...
2020-05-04 20:37:47 Training - Downloading the training image..[34mArguments: train[0m
[34m[2020-05-04:20:38:07:INFO] Running standalone xgboost training.[0m
[34m[2020-05-04:20:38:07:INFO] File size need to be processed in the node: 0.5mb. Available memory size in the node: 8479.16mb[0m
[34m[2020-05-04:20:38:07:INFO] Determined delimiter of CSV input is ','[0m
[34m[20:38:07] S3DistributionType set as FullyReplicated[0m
[34m[20:38:07] 7620x13 matrix with 99060 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-05-04:20:38:07:INFO] Determined delimiter of CSV input is ','[0m
[34m[20:38:07] S3DistributionType set as FullyReplicated[0m
[34m[20:38:07] 3266x13 matrix with


2020-05-04 20:38:19 Uploading - Uploading generated training model
2020-05-04 20:38:19 Completed - Training job completed
Training seconds: 65
Billable seconds: 65


## Deploy Model

In [17]:
# Deploy the trained model behind a single-instance endpoint.
# NOTE(review): the endpoint name is fixed, so re-running this cell while
# an endpoint with the same name exists presumably fails; confirm, and
# make sure the endpoint is deleted when it is no longer needed.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    endpoint_name='xgboost-bikerental-v1',
)

---------------!

## Run Predictions

In [None]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Send requests as CSV text and leave the response undeserialized.
predictor.serializer = csv_serializer
predictor.deserializer = None
predictor.content_type = 'text/csv'

In [None]:
# Request a prediction for a single row of 12 values. The training log
# reports a 13-column matrix with the label in column 0, so this is
# presumably the 12 feature columns in training order; TODO confirm.
predictor.predict([[3,0,1,2,28.7,33.335,79,12.998,2011,7,7,3]])