In [1]:
import sagemaker
from sagemaker import get_execution_role
import pandas as pd
import numpy as np
import boto3
import re
print('Libraries Loaded!')

Libraries Loaded!


In [2]:
#upload data to S3
bucket_name = 'awsmlprac'

train_data = r'bcancer/training/'
val_data = r'bcancer/validation/'

s3_model_output_location = r's3://{0}/bcancer/model'.format(bucket_name)
s3_training_file_location = r's3://{0}/{1}'.format(bucket_name, train_data)
s3_validation_file_location = r's3://{0}/{1}'.format(bucket_name, val_data)

print('Model artifacts will be saved to {}'.format(s3_model_output_location))
print('Model training data file location is {}'.format(s3_training_file_location))
print('Model validation data file location is {}'.format(s3_validation_file_location))

Model artifacts will be saved to s3://awsmlprac/bcancer/model
Model training data file location is s3://awsmlprac/bcancer/training/
Model validation data file location is s3://awsmlprac/bcancer/validation/


In [3]:
def write_to_s3(filename, bucket, key):
    with open(filename, 'rb') as f:
        return boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_fileobj(f)

In [4]:
write_to_s3('train.csv',
           bucket_name,
           train_data + 'train.csv')

write_to_s3('test.csv',
           bucket_name,
           val_data + 'test.csv')

#### Model Training

In [5]:
sess = sagemaker.Session()
role = get_execution_role()
print(role)

arn:aws:iam::728861006721:role/service-role/AmazonSageMaker-ExecutionRole-20200519T155910


In [6]:
#get image
container = sagemaker.amazon.amazon_estimator.get_image_uri(sess.boto_region_name,
                                                           "xgboost",
                                                           "latest")
print('SageMaker XGBoost info: \n{} ({})'.format(container, sess.boto_region_name))

	get_image_uri(region, 'xgboost', '0.90-1').


SageMaker XGBoost info: 
991648021394.dkr.ecr.ap-south-1.amazonaws.com/xgboost:latest (ap-south-1)


In [7]:
#building model
estimator = sagemaker.estimator.Estimator(container,
                                         role,
                                         train_instance_count = 1,
                                         train_instance_type = 'ml.m4.xlarge',
                                         output_path = s3_model_output_location,
                                         sagemaker_session = sess,
                                         base_job_name = 'v1-xgboost-bcancer')

In [8]:
#set hyper parameters
estimator.set_hyperparameters(max_depth = 3, objective = 'binary:logistic',
                             num_round = 500)

In [9]:
estimator.hyperparameters()

{'max_depth': 3, 'objective': 'binary:logistic', 'num_round': 500}

In [10]:
#specify the files for training and test
train_input_config = sagemaker.session.s3_input(s3_data=s3_training_file_location,
                                                  content_type = 'csv',
                                                  s3_data_type = 'S3Prefix')
validation_input_config = sagemaker.session.s3_input(s3_data=s3_validation_file_location,
                                                  content_type = 'csv',
                                                  s3_data_type = 'S3Prefix')

data_channels = {'train': train_input_config, 'validation':validation_input_config}
print(train_input_config.config)
print(validation_input_config.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://awsmlprac/bcancer/training/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://awsmlprac/bcancer/validation/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}


In [11]:
#start training
estimator.fit(data_channels)

2020-05-19 20:15:05 Starting - Starting the training job...
2020-05-19 20:15:07 Starting - Launching requested ML instances...
2020-05-19 20:16:03 Starting - Preparing the instances for training.........
2020-05-19 20:17:34 Downloading - Downloading input data
2020-05-19 20:17:34 Training - Downloading the training image...
2020-05-19 20:18:06 Uploading - Uploading generated training model
2020-05-19 20:18:06 Completed - Training job completed
[34mArguments: train[0m
[34m[2020-05-19:20:17:54:INFO] Running standalone xgboost training.[0m
[34m[2020-05-19:20:17:54:INFO] File size need to be processed in the node: 0.42mb. Available memory size in the node: 8505.0mb[0m
[34m[2020-05-19:20:17:54:INFO] Determined delimiter of CSV input is ','[0m
[34m[20:17:54] S3DistributionType set as FullyReplicated[0m
[34m[20:17:55] 571x30 matrix with 17130 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-05-19:20:17:55:INFO] Determined delimiter o

Training seconds: 55
Billable seconds: 55


In [12]:
#deploy model
predictor = estimator.deploy(initial_instance_count = 1,
                            instance_type = 'ml.m4.xlarge',
                            endpoint_name = 'v2-xgboost-bcancer')

-------------!

##### Prediction

In [13]:
from sagemaker.predictor import csv_serializer, json_deserializer
predictor.serializer = csv_serializer
predictor.deserializer = None
predictor.content_type = 'text/csv'

In [14]:
#get a realtime endpoint
endpoint_name = 'v2-xgboost-bcancer'
predictor = sagemaker.predictor.RealTimePredictor(endpoint = endpoint_name)

In [15]:
df_test = pd.read_csv('test.csv', header = None)
df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,1,0.29236,0.082784,0.282703,0.26443,-0.045027,0.024042,0.023652,0.144536,-0.099429,...,0.287815,0.01897,0.270746,0.222961,-0.005801,-0.016519,-0.001696,0.213438,-0.058544,-0.109718
1,1,0.148739,0.207317,0.141186,0.116187,-0.117814,0.020118,0.074041,0.041953,-0.17405,...,0.14624,0.260735,0.127192,0.099149,0.074169,0.153229,0.184833,0.203759,-0.078998,-0.029554
2,0,-0.213985,-0.082842,-0.209585,-0.158808,-0.104273,-0.036995,-0.010729,-0.12922,0.113324,...,-0.196343,-0.069489,-0.179348,-0.129518,-0.059886,0.084733,0.129322,-0.114798,0.016015,0.093823
3,1,0.0768,0.032477,0.073464,0.045137,0.055428,0.049258,0.060686,0.078036,0.062314,...,0.117069,0.122942,0.092827,0.050977,0.150772,0.187963,0.100169,0.114412,0.215305,0.147279
4,0,0.024266,-0.082504,0.015486,0.004713,-0.089828,-0.114597,-0.086409,-0.104419,-0.203343,...,0.006788,-0.006323,-0.006281,-0.012186,-0.11998,-0.045857,-0.022754,-0.124832,-0.087868,-0.117971


In [16]:
#convert into array
arr_test = df_test[df_test.columns[1:]].values
arr_test[0]

array([ 0.29235996,  0.08278374,  0.28270341,  0.26442997, -0.04502654,
        0.02404165,  0.02365156,  0.14453646, -0.09942882, -0.15404987,
        0.06951005, -0.06869364,  0.06909618,  0.06394245, -0.0337305 ,
       -0.04992546, -0.02080663,  0.05174653, -0.0864745 , -0.06590675,
        0.28781464,  0.01896955,  0.2707463 ,  0.22296053, -0.00580121,
       -0.01651891, -0.00169642,  0.21343805, -0.05854376, -0.10971822])

In [17]:
type(arr_test[0])

numpy.ndarray

In [20]:
#result = predictor.predict(arr_test[0])
#result

In [19]:
predictor.delete_endpoint()