In [47]:
import  sagemaker
from sagemaker import get_execution_role

import numpy as np
import pandas as pd
import boto3
import re

In [48]:
# S3 layout for this experiment: a single bucket with separate key prefixes
# for the training data, the validation data and the model artifacts.
bucket_name = 'awsdataforml'
train_data = 'bcancer/training/'
val_data = 'bcancer/validation/'

# Full s3:// URIs derived from the bucket name and the prefixes above.
s3_model_output_location = 's3://{bucket}/bcancer/model'.format(bucket=bucket_name)
s3_training_file_location = 's3://{bucket}/{prefix}'.format(bucket=bucket_name, prefix=train_data)
s3_validation_file_location = 's3://{bucket}/{prefix}'.format(bucket=bucket_name, prefix=val_data)


In [49]:
# Echo the resolved S3 URIs (one per line) so they can be checked by eye.
print(s3_model_output_location,
      s3_training_file_location,
      s3_validation_file_location,
      sep='\n')

s3://awsdataforml/bcancer/model
s3://awsdataforml/bcancer/training/
s3://awsdataforml/bcancer/validation/


In [50]:
def write_to_s3(filename,bucket,key):
    """Upload a local file to an S3 object.

    Args:
        filename: Path of the local file; it is opened in binary mode.
        bucket: Name of the destination S3 bucket.
        key: Object key to write inside ``bucket``.

    Returns:
        Whatever ``Object.upload_fileobj`` returns.
    """
    with open(filename, 'rb') as source:
        # A new boto3 session/resource is created on every call.
        s3 = boto3.Session().resource('s3')
        target = s3.Bucket(bucket).Object(key)
        return target.upload_fileobj(source)
    

In [51]:
# Upload each local CSV under its S3 prefix, keeping the local file name as
# the last part of the key (test.csv goes to the validation prefix).
for local_name, prefix in (('train.csv', train_data), ('test.csv', val_data)):
    write_to_s3(local_name, bucket_name, prefix + local_name)


In [52]:
# Execution role of this notebook environment (passed to the estimator later)
# and the SageMaker SDK session used for region lookup and job submission.
role = get_execution_role()
sess = sagemaker.Session()
role

'arn:aws:iam::905255159123:role/service-role/AmazonSageMaker-ExecutionRole-20200713T003663'

In [53]:
# Look up the Docker image URI of the built-in XGBoost algorithm for the
# session's region. Arguments: region, algorithm name, version tag.
# NOTE(review): the recorded output of this cell contains the fragment
# "get_image_uri(region, 'xgboost', '1.0-1')", which looks like an SDK hint
# to pin a newer image version instead of "latest" — confirm before changing.
xgb_region = sess.boto_region_name
container = sagemaker.amazon.amazon_estimator.get_image_uri(xgb_region, "xgboost", "latest")
print("Sagemeaker xgboost info : {} ({})".format(container, xgb_region))

	get_image_uri(region, 'xgboost', '1.0-1').


Sagemeaker xgboost info : 811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest (us-east-1)


In [54]:
# Configure the training job: one ml.m4.xlarge instance running the XGBoost
# image, with model artifacts written under s3_model_output_location.
estimator_options = dict(
    train_instance_count=1,
    train_instance_type="ml.m4.xlarge",
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name="v1-xgboost-bcancer",
)

estimator = sagemaker.estimator.Estimator(container, role, **estimator_options)



In [55]:
# XGBoost hyperparameters. num_round is the number of boosting rounds
# (the counterpart of the number of estimators in other APIs).
xgb_hyperparameters = {
    "max_depth": 3,
    "objective": "binary:logistic",
    "num_round": 500,
}
estimator.set_hyperparameters(**xgb_hyperparameters)

In [56]:
# Display the hyperparameters currently set on the estimator as a sanity check.
estimator.hyperparameters()

{'max_depth': 3, 'objective': 'binary:logistic', 'num_round': 500}

In [57]:
# Describe where the training and validation CSVs live. Both channels use the
# same settings (CSV content, every object under the S3 prefix), so they are
# built from one template instead of two copy-pasted calls.
data_channels = {
    channel: sagemaker.session.s3_input(
        s3_data=s3_uri,
        content_type="csv",
        s3_data_type="S3Prefix")
    for channel, s3_uri in (
        ('train', s3_training_file_location),
        ('validation', s3_validation_file_location),
    )
}

# Keep the individual configs available under their own names.
training_input_config = data_channels['train']
validation_input_config = data_channels['validation']

print(training_input_config.config)
print(validation_input_config.config)



{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://awsdataforml/bcancer/training/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://awsdataforml/bcancer/validation/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}


In [58]:
# Start the training job with the 'train' and 'validation' channels; the
# job's log is streamed into the cell output below.
estimator.fit(data_channels)

2020-07-13 18:57:10 Starting - Starting the training job...
2020-07-13 18:57:12 Starting - Launching requested ML instances......
2020-07-13 18:58:28 Starting - Preparing the instances for training......
2020-07-13 18:59:36 Downloading - Downloading input data...
2020-07-13 18:59:56 Training - Downloading the training image..[34mArguments: train[0m
[34m[2020-07-13:19:00:17:INFO] Running standalone xgboost training.[0m
[34m[2020-07-13:19:00:17:INFO] File size need to be processed in the node: 0.44mb. Available memory size in the node: 8482.05mb[0m
[34m[2020-07-13:19:00:17:INFO] Determined delimiter of CSV input is ','[0m
[34m[19:00:17] S3DistributionType set as FullyReplicated[0m
[34m[19:00:17] 571x31 matrix with 17701 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-07-13:19:00:17:INFO] Determined delimiter of CSV input is ','[0m
[34m[19:00:17] S3DistributionType set as FullyReplicated[0m
[34m[19:00:17] 143x31 matrix with 

In [61]:
# Deploy the trained model to a real-time inference endpoint.
# initial_instance_count is the number of instances launched to host the
# endpoint (it is NOT the number of requests the model will receive).
predictor = estimator.deploy(initial_instance_count=1,
                            instance_type='ml.m4.xlarge',
                            endpoint_name='v4-xgboost-bcancer')





---------------!

### Starting prediction

In [69]:
from sagemaker.predictor import csv_serializer,json_deserializer
# Requests are sent as CSV: csv_serializer converts the input row to CSV text,
# and the request content type must be the MIME type "text/csv"
# (the previous value "test/csv" was a typo).
predictor.serializer = csv_serializer
# No deserializer is set, so predict() hands back the response undecoded.
predictor.deserializer = None
predictor.content_type = 'text/csv'

In [75]:
# Attach to the already-deployed endpoint by name.
# A newly constructed RealTimePredictor has no serializer of its own, so the
# CSV serializer and content type are supplied here. Without them, predict()
# would pass the NumPy row through without converting it to CSV.
endpoint_name = 'v4-xgboost-bcancer'
predictor = sagemaker.predictor.RealTimePredictor(
    endpoint=endpoint_name,
    serializer=sagemaker.predictor.csv_serializer,
    content_type='text/csv')

In [76]:
# header=None: treat every row of test.csv as data, so the columns are
# labeled with integers (0, 1, 2, ...) instead of names from a header row.
df_test = pd.read_csv("test.csv",header = None)
df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,1,-0.032358,0.049468,-0.034141,0.058776,0.024585,0.178226,0.172732,0.210889,0.20496,...,0.131871,0.071645,0.135253,0.084355,0.30258,0.161123,0.191402,0.318488,0.096601,0.133214
1,1,0.966655,0.629122,0.236062,0.656008,0.78308,0.108691,0.289734,0.643629,0.596326,...,0.703337,0.151726,0.716862,0.829094,0.022,0.16623,0.328364,0.508226,-0.051204,-0.063465
2,1,-0.007328,0.082722,0.101937,0.099394,0.05604,0.176468,0.239854,0.265535,0.238806,...,0.146926,0.135638,0.123539,0.099072,0.339019,0.21289,0.314304,0.308776,0.096719,0.250981
3,1,-0.032332,0.221862,0.088826,0.218722,0.192288,0.039543,0.1064,0.152959,0.17832,...,0.23401,0.149129,0.197282,0.184158,0.050894,0.102471,0.1008,0.204741,0.034134,0.004878
4,0,-0.02344,-0.24844,-0.128497,-0.243446,-0.175563,-0.121245,-0.083924,-0.097445,-0.124797,...,-0.223699,-0.212613,-0.20913,-0.1422,-0.203847,-0.125608,-0.141996,-0.230193,-0.092008,-0.063202


In [77]:
# Keep every column except the first as a NumPy array of model inputs.
# NOTE(review): column 0 is presumably the label (the training log reads
# the CSV with label_column=0) — confirm.
arr_test = df_test.iloc[:, 1:].values
arr_test.shape

(143, 31)

In [78]:
# Inspect the first row of arr_test, which is the input used for prediction.
arr_test[0]

array([-0.03235795,  0.04946806, -0.03414121,  0.05877615,  0.02458482,
        0.17822639,  0.17273243,  0.21088944,  0.20495963,  0.11825181,
        0.10036421,  0.04670021, -0.01801719,  0.04455317,  0.02502207,
        0.00280745,  0.01451559,  0.01965692,  0.05132337, -0.04647002,
        0.00894406,  0.1318715 ,  0.07164464,  0.13525271,  0.08435517,
        0.30257967,  0.16112256,  0.19140161,  0.31848832,  0.09660109,
        0.13321403])

In [None]:
# Send the first test row to the endpoint and display the response.
predictor.predict(arr_test[0])