# UDACITY SageMaker Essentials: Training Job Demo

In [3]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker import image_uris
from sagemaker.predictor import csv_serializer

# Create the SageMaker session and look up the IAM execution role of this
# notebook; the role is printed so it can be checked at a glance.
session = sagemaker.Session()

role = get_execution_role()

print(role)

# If you're following along, you'll need to upload these datasets to your own bucket in S3
# and change `bucket` below; every S3 location in this cell is derived from it.

bucket = 'soumyadefaultbucket1001'

test_location = f's3://{bucket}/test.csv'
val_location = f's3://{bucket}/validation.csv'
train_location = f's3://{bucket}/train.csv'

# We use this prefix to help us determine where the output will go.

prefix = f's3://{bucket}/output-data/'

# We need to get the location of the container.
# NOTE(review): version='latest' appears to resolve to the legacy built-in XGBoost
# image; consider pinning an explicit version for reproducibility. Confirm against
# the SageMaker XGBoost documentation before changing it, because the accepted
# hyperparameter values differ between image versions.

container = image_uris.retrieve('xgboost', session.boto_region_name, version='latest')

# Now that we know which container to use, we can construct the estimator object.
# The model artifacts are written under `prefix`, the s3://.../output-data/ URI
# defined earlier in this cell, instead of repeating that literal here.
xgb = sagemaker.estimator.Estimator(
    container,                     # The image URI of the training container
    role,                          # The IAM role to use (our current role in this case)
    instance_count=1,              # The number of instances to use for training
    instance_type='ml.m4.xlarge',  # The type of instance to use for training
    output_path=prefix,            # Where to save the output (the model artifacts)
    sagemaker_session=session,     # The current SageMaker session
)
             
# These hyperparameters are beyond the scope of this course, but you can research the algorithm here:
# https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html    
    
# Gather the XGBoost settings in one mapping and hand them to the estimator
# in a single call.
# NOTE(review): 'reg:linear' is reportedly deprecated in newer XGBoost releases
# in favour of 'reg:squarederror'; confirm which one the selected container
# accepts before changing it.
_xgb_hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'objective': 'reg:linear',
    'early_stopping_rounds': 10,
    'num_round': 200,
}
xgb.set_hyperparameters(**_xgb_hyperparameters)
                        
# Wrap the train and validation CSV locations as training inputs; both use
# the same content type, so they are built by one expression.
s3_input_train, s3_input_validation = (
    sagemaker.inputs.TrainingInput(s3_data=location, content_type='csv')
    for location in (train_location, val_location)
)

# Map each channel name to its input, then launch the training job with fit().
_data_channels = {
    'train': s3_input_train,
    'validation': s3_input_validation,
}
xgb.fit(_data_channels)


arn:aws:iam::920440760924:role/service-role/AmazonSageMaker-ExecutionRole-20220503T084501
2022-05-03 04:28:31 Starting - Starting the training job...
2022-05-03 04:28:59 Starting - Preparing the instances for trainingProfilerReport-1651552111: InProgress
............
2022-05-03 04:31:01 Downloading - Downloading input data......
2022-05-03 04:31:56 Training - Downloading the training image......
2022-05-03 04:32:52 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2022-05-03:04:32:45:INFO] Running standalone xgboost training.[0m
[34m[2022-05-03:04:32:45:INFO] File size need to be processed in the node: 0.02mb. Available memory size in the node: 8500.18mb[0m
[34m[2022-05-03:04:32:45:INFO] Determined delimiter of CSV input is ','[0m
[34m[04:32:45] S3DistributionType set as FullyReplicated[0m
[34m[04:32:45] 227x13 matrix with 2951 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2022-05-03:04:32:45:INFO] Deter