In [59]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from boto3.s3.transfer import TransferConfig

# Data transfer to S3

In [63]:
# S3 bucket and the object keys for the train / validation / test CSV files.
bucket_name = 'psn-sagemaker'
train_key = 'bsd/ctrain.csv'
val_key = 'bsd/cval.csv'
test_key = 'bsd/ctest.csv'

# Full s3:// URIs built from the bucket name and the keys above.
model_loc = 's3://{}/bsd/model'.format(bucket_name)
train_loc = 's3://{}/{}'.format(bucket_name, train_key)
val_loc = 's3://{}/{}'.format(bucket_name, val_key)
test_loc = 's3://{}/{}'.format(bucket_name, test_key)

# Show the resolved locations as the cell output.
train_loc, val_loc, test_loc, model_loc

('s3://psn-sagemaker/bsd/ctrain.csv',
 's3://psn-sagemaker/bsd/cval.csv',
 's3://psn-sagemaker/bsd/ctest.csv',
 's3://psn-sagemaker/bsd/model')

In [64]:
# Transfer settings shared by every upload in this notebook.
config = TransferConfig(
    multipart_threshold=8 * 1024 * 1024,  # 8388608 bytes (8 MiB)
    max_concurrency=10,
    use_threads=True,
)
# Low-level S3 client used as the default uploader.
s3 = boto3.client('s3')
    
def transfer_file_to_s3(
    file_name,
    key_name,
    bucket_name=bucket_name,
    config=config,
    s3_client=s3,
):
    """Upload one local file to S3.

    Args:
        file_name: Path of the local file to upload.
        key_name: Object key the file is stored under in the bucket.
        bucket_name: Destination bucket; defaults to the notebook-level
            ``bucket_name``.
        config: Transfer configuration forwarded to ``upload_file`` as
            ``Config``; defaults to the notebook-level ``config``.
        s3_client: Object whose ``upload_file`` method performs the upload;
            defaults to the notebook-level ``s3`` client.

    Returns:
        None.

    Note:
        The defaults are bound when this cell is executed, so re-run this
        cell after changing ``bucket_name``, ``config`` or ``s3``.
    """
    s3_client.upload_file(file_name, bucket_name, key_name, Config=config)
    
    

In [65]:
# Upload each local CSV split to its S3 key (train, validation, test - in that order).
for local_csv, s3_key in (
    ('ctrain.csv', train_key),
    ('cval.csv', val_key),
    ('ctest.csv', test_key),
):
    transfer_file_to_s3(local_csv, s3_key)

# Finding the right Docker image for machine learning algorithms
https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html

In [66]:
# ECR URI of the XGBoost training image, assembled from its parts.
# NOTE(review): the registry account and region are pinned to us-east-1;
# confirm against the registry-paths page linked above before using another region.
docker_image_for_sg_boost = '{account}.dkr.ecr.{region}.amazonaws.com/{repo}:{tag}'.format(
    account='811284229777',
    region='us-east-1',
    repo='xgboost',
    tag='latest',
)

In [67]:
# SageMaker needs an IAM role that grants access to the S3 bucket and related resources.
role = get_execution_role()
# Display the role ARN as the cell output.
role

'arn:aws:iam::993741996993:role/service-role/AmazonSageMaker-ExecutionRole-20190520T202818'

Training

In [68]:
# SageMaker session handed to the estimator via `sagemaker_session`.
train_sess = sagemaker.Session()

In [69]:
# Show the region of the default boto3 session; the training image URI is pinned to us-east-1.
boto3.Session().region_name

'us-east-1'

In [71]:
# Generic estimator wrapping the XGBoost container image.
# The first four arguments are passed positionally, as in the original call.
estimator = sagemaker.estimator.Estimator(
    docker_image_for_sg_boost,  # training container image
    role,                       # execution role
    1,                          # training instance count
    'ml.m4.xlarge',             # training instance type
    output_path=model_loc,      # S3 location for the trained model artifacts
    sagemaker_session=train_sess,
    base_job_name='ml-xgboost-bsdv1',
)

In [72]:
# Hyperparameters forwarded to the XGBoost training container.
# NOTE(review): `n_estimators` and `num_round` both look like the boosting-round
# count; confirm which one the container actually honours.
estimator.set_hyperparameters(
    max_depth=8,
    n_estimators=100,
    num_round=100,
    learning_rate=0.05,
    subsample=0.8,
    objective="reg:linear",
)

In [73]:
# Display the hyperparameters currently set on the estimator.
estimator.hyperparameters()

{'max_depth': 8,
 'n_estimators': 100,
 'num_round': 100,
 'learning_rate': 0.05,
 'subsample': 0.8,
 'objective': 'reg:linear'}

# Formatting the training and validation inputs

In [76]:
# Wrap the train and validation CSV locations as SageMaker S3 inputs (same settings for both).
train_input, val_input = (
    sagemaker.session.s3_input(s3_data=csv_uri, content_type="csv")
    for csv_uri in (train_loc, val_loc)
)

In [77]:
# Inspect the channel configuration generated for the training input.
train_input.config

{'DataSource': {'S3DataSource': {'S3DataDistributionType': 'FullyReplicated',
   'S3DataType': 'S3Prefix',
   'S3Uri': 's3://psn-sagemaker/bsd/ctrain.csv'}},
 'ContentType': 'csv'}

https://docs.aws.amazon.com/sagemaker/latest/dg/API_S3DataSource.html

# Fit the model

In [78]:
# Start the training job with the 'train' and 'validation' channels;
# the job's log is streamed into the cell output.
estimator.fit(dict(train=train_input, validation=val_input))

2019-06-04 00:13:12 Starting - Starting the training job............
2019-06-04 00:15:03 Starting - Launching requested ML instances......
2019-06-04 00:16:12 Starting - Preparing the instances for training......
2019-06-04 00:17:13 Downloading - Downloading input data...
2019-06-04 00:17:43 Training - Downloading the training image.
[31mArguments: train[0m
[31m[2019-06-04:00:17:47:INFO] Running standalone xgboost training.[0m
[31m[2019-06-04:00:17:47:INFO] File size need to be processed in the node: 0.72mb. Available memory size in the node: 8426.61mb[0m
[31m[2019-06-04:00:17:47:INFO] Determined delimiter of CSV input is ','[0m
[31m[00:17:47] S3DistributionType set as FullyReplicated[0m
[31m[00:17:47] 8708x15 matrix with 130620 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[2019-06-04:00:17:47:INFO] Determined delimiter of CSV input is ','[0m
[31m[00:17:47] S3DistributionType set as FullyReplicated[0m
[31m[00:17:47] 2178x15 


2019-06-04 00:17:58 Uploading - Uploading generated training model
2019-06-04 00:17:58 Completed - Training job completed
Billable seconds: 45


# Deploying the model and calling the endpoint for a prediction

In [79]:
# Deploy the trained model to an endpoint backed by one ml.m4.xlarge instance.
# NOTE(review): the endpoint name is fixed, so re-running this cell presumably
# fails while an endpoint with this name still exists - confirm and delete the
# endpoint when it is no longer needed.
predictor = estimator.deploy(initial_instance_count=1,
                            instance_type='ml.m4.xlarge',
                            endpoint_name= 'deploy-bsd-xgboost-v1')

---------------------------------------------------------------------------------------------------!

In [80]:
from sagemaker.predictor import csv_serializer

# Send request payloads as CSV. The original assigned to the misspelled
# attribute `cotent_tye`, which left the predictor's content type unset.
predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
# No deserializer: predict() returns the raw response body (bytes).
predictor.deserializer = None


In [81]:
# Score a single row of 15 feature values (the training log reports a 15-column matrix).
predictor.predict([[3,0,1,2,27.88,31.82,89,19.9995,7,90,8,2011,4,4,6]])

b'4.58880138397'