### XGBoost cloud training template for Diabetes Prediction

In [1]:
import numpy as np
import pandas as pd

import boto3
import re
import sagemaker
from sagemaker import get_execution_role

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:
# Load KEY=VALUE settings (including the S3 bucket name) from the local .env file

import os


def _parse_env_lines(lines):
    """Parse ``KEY=VALUE`` lines into a dict.

    Blank lines and lines starting with ``#`` are skipped. Each remaining line
    is split on the FIRST ``=`` only, so values that themselves contain ``=``
    are kept intact. Surrounding whitespace is stripped from keys and values.

    Raises:
        ValueError: if a non-blank, non-comment line has no ``=`` or an empty key.
            The message reports only the line number so that secret values
            never end up in the notebook output.
    """
    parsed = {}
    for line_no, raw_line in enumerate(lines, start=1):
        line = raw_line.strip()
        if not line or line.startswith('#'):
            continue
        key, sep, value = line.partition('=')
        key = key.strip()
        if not sep or not key:
            raise ValueError(f'.env line {line_no} is not in KEY=VALUE form')
        parsed[key] = value.strip()
    return parsed


# Read the file with plain Python rather than `!cat`, so the cell does not
# depend on a shell and a missing file fails with a clear FileNotFoundError.
with open('./.env') as env_file:
    env_vars = env_file.read().splitlines()

os.environ.update(_parse_env_lines(env_vars))

In [3]:
# Required setting: raises KeyError if BUCKET_NAME is not present in the environment
bucket_name = os.environ['BUCKET_NAME']

In [None]:
# Echo the resolved bucket name as the cell output
bucket_name

In [5]:
# S3 object keys and full s3:// URIs for the diabetes files

# Object keys (paths inside the bucket) for the training and validation CSVs
training_file_key = 'diabetes/diabetes_train.csv'
validation_file_key = 'diabetes/diabetes_validation.csv'

# Full URIs built from bucket_name: the model output prefix, then the two data files
s3_model_output_location = r's3://{0}/diabetes/model'.format(bucket_name)
s3_training_file_location = r's3://{0}/{1}'.format(bucket_name, training_file_key)
s3_validation_file_location = r's3://{0}/{1}'.format(bucket_name, validation_file_key)

In [None]:
# Print the three derived S3 URIs to check they were built as expected
print(s3_model_output_location)
print(s3_training_file_location)
print(s3_validation_file_location)

In [7]:
# function to write files into s3

def write_to_s3(filename, bucket, key):
    """Upload a local file to S3.

    Args:
        filename: path of the local file; it is opened in binary mode.
        bucket: name of the destination S3 bucket.
        key: object key to write inside the bucket.

    Returns:
        Whatever ``upload_fileobj`` returns for the target object.
    """
    with open(filename, 'rb') as local_file:
        # A fresh boto3 session/resource is created on every call
        s3 = boto3.Session().resource('s3')
        target = s3.Bucket(bucket).Object(key)
        return target.upload_fileobj(local_file)

In [8]:
# Upload the local imputed train/validation CSVs to their S3 object keys
write_to_s3('diabetes_train_imputed.csv',bucket_name,training_file_key)
write_to_s3('diabetes_validation_imputed.csv',bucket_name, validation_file_key)

In [9]:
# S3 object key and full URI for the text file listing the training columns
diabetes_training_column_list_file_key = 'diabetes/diabetes_training_column_list.txt'
s3_training_column_list_file_location = r's3://{0}/{1}'.format(bucket_name, diabetes_training_column_list_file_key)

In [10]:
# Upload the local column-list file next to the datasets in the bucket
write_to_s3('diabetes_training_column_list.txt',bucket_name, diabetes_training_column_list_file_key)

### Training XGBoost Algorithm in Docker Image

#### Using the AWS-maintained XGBoost image for the session's region

In [9]:
# Managed spot training settings (all of these are passed to the Estimator below)

use_spot_instance = True
max_run = 3600  # limit on the training run itself

# max_wait only applies to managed spot training, so it is None when spot is
# turned off. That way flipping use_spot_instance is the only change needed;
# previously max_wait stayed at 7200 and was still sent for on-demand jobs.
max_wait = 7200 if use_spot_instance else None

job_name = 'xgboost-diabetes-v1'

# Checkpoint location is only set for spot runs; otherwise it stays None
checkpoint_s3_uri = (
    f's3://{bucket_name}/diabetes/checkpoints/{job_name}'
    if use_spot_instance
    else None
)

In [10]:
# SageMaker session; later cells read its region and pass it to the Estimator
sess = sagemaker.Session()

In [11]:
# Execution role returned by get_execution_role(); passed to the Estimator below
role = get_execution_role()

In [None]:
# Print the role to confirm which one was picked up
print(role)

In [13]:
# Look up the XGBoost image URI for the session's region at a pinned version
xgboost_version = '1.7-1'
container = sagemaker.image_uris.retrieve(
    "xgboost",
    sess.boto_region_name,
    version=xgboost_version,
)

print(f'Using XGBoost container:{container}')

Using XGBoost container:683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.7-1


### Build Model

In [14]:
# Configure the training job:
#   - the type and number of instances used for training
#   - the S3 location where the final model artifacts are stored
#   Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html

# For managed spot training, pass use_spot_instances, max_run, max_wait and checkpoint_s3_uri

# SDK 2.x does not require the train_ prefix on the instance count and type arguments

estimator = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path = s3_model_output_location,
    sagemaker_session = sess,
    # Prefix of the generated job name (the fit log shows a timestamp suffix appended to it)
    base_job_name=job_name,
    # Spot-training settings defined in the spot configuration cell
    use_spot_instances=use_spot_instance,
    max_run=max_run,
    max_wait=max_wait,
    checkpoint_s3_uri=checkpoint_s3_uri
)

In [15]:
# Hyperparameters for the XGBoost training job
# XGBoost parameter reference: https://xgboost.readthedocs.io/en/latest/parameter.html

xgb_hyperparameters = {
    'max_depth': 5,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'num_round': 100,
    'early_stopping_rounds': 10,
}

estimator.set_hyperparameters(**xgb_hyperparameters)

In [16]:
# Show the hyperparameters currently set on the estimator
estimator.hyperparameters()

{'max_depth': 5,
 'objective': 'binary:logistic',
 'eval_metric': 'logloss',
 'num_round': 100,
 'early_stopping_rounds': 10}

In [17]:
### Prepare the training and validation input channels

def _csv_channel(s3_uri):
    """Build a TrainingInput for CSV data located at ``s3_uri`` (S3Prefix type)."""
    return sagemaker.session.TrainingInput(
        s3_data=s3_uri,
        content_type='csv',
        s3_data_type='S3Prefix',
    )


training_input_config = _csv_channel(s3_training_file_location)
validation_input_config = _csv_channel(s3_validation_file_location)

# Channel names map to the inputs handed to estimator.fit()
data_channels = {
    'train': training_input_config,
    'validation': validation_input_config,
}

In [18]:
# Sanity check that both inputs exist; printing shows only the default object repr
print(training_input_config)
print(validation_input_config)

<sagemaker.inputs.TrainingInput object at 0x7ff713bf7b20>
<sagemaker.inputs.TrainingInput object at 0x7ff718183ee0>


In [19]:
# Show the channel-name -> input mapping that will be passed to fit()
print(data_channels)

{'train': <sagemaker.inputs.TrainingInput object at 0x7ff713bf7b20>, 'validation': <sagemaker.inputs.TrainingInput object at 0x7ff718183ee0>}


### Train the model

In [20]:
# XGBoost supports "train" and "validation" data channels

# Start the training job with both channels; the job log is streamed into the cell output
estimator.fit(data_channels)

INFO:sagemaker:Creating training-job with name: xgboost-diabetes-v1-2024-05-05-07-12-52-922


2024-05-05 07:12:53 Starting - Starting the training job...
2024-05-05 07:13:08 Starting - Preparing the instances for training...
2024-05-05 07:13:33 Downloading - Downloading input data...
2024-05-05 07:13:58 Downloading - Downloading the training image...
2024-05-05 07:14:39 Training - Training image download completed. Training in progress..[34m[2024-05-05 07:14:50.333 ip-10-2-68-165.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-05-05 07:14:50.360 ip-10-2-68-165.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-05-05:07:14:50:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-05-05:07:14:50:INFO] Failed to parse hyperparameter eval_metric value logloss to Json.[0m
[34mReturning the value itself[0m
[34m[2024-05-05:07:14:50:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2024-05-05:07:14:50:INFO] N

### Deploy Predictor Model

In [21]:
# Deploy the trained model to an endpoint named after job_name and keep the predictor handle.
# NOTE(review): the endpoint name is fixed, so re-running this cell while that endpoint
#   still exists will presumably fail — confirm, or delete the old endpoint first.
# NOTE(review): no visible cell deletes the endpoint; call predictor.delete_endpoint()
#   once you are done with the predictions below.
predictor = estimator.deploy(
                        initial_instance_count=1,
                        instance_type='ml.m5.xlarge',
                        endpoint_name=job_name)

INFO:sagemaker:Creating model with name: xgboost-diabetes-v1-2024-05-05-07-15-58-402
INFO:sagemaker:Creating endpoint-config with name xgboost-diabetes-v1
INFO:sagemaker:Creating endpoint with name xgboost-diabetes-v1


-----!

### Try Predicting

In [22]:
# SDK 2.0 serializers

# Use the CSV serializer for request payloads; the predict() calls below pass nested Python lists
from sagemaker.serializers import CSVSerializer
predictor.serializer = CSVSerializer()

In [23]:
# Score four sample rows; the raw response is shown as the cell output.
# presumably each row holds the 8 feature columns in training order —
#   confirm against diabetes_training_column_list.txt
predictor.predict(
    [[1.0,130.0,70.0,13.0,105.0,25.9,0.472,22],
[8.0,133.0,72.0,33.0,207.0,32.9,0.27,39],
[9.0,130.0,70.0,33.0,207.0,34.2,0.652,45],
[7.0,107.0,74.0,33.0,207.0,29.6,0.254,31]]
)

b'0.08117855340242386\n0.9971675276756287\n0.9987726807594299\n0.9963440299034119\n'

In [28]:
# Score the same four sample rows again, this time keeping the raw response for parsing
sample_rows = [
    [1.0, 130.0, 70.0, 13.0, 105.0, 25.9, 0.472, 22],
    [8.0, 133.0, 72.0, 33.0, 207.0, 32.9, 0.27, 39],
    [9.0, 130.0, 70.0, 33.0, 207.0, 34.2, 0.652, 45],
    [7.0, 107.0, 74.0, 33.0, 207.0, 29.6, 0.254, 31],
]
result = predictor.predict(sample_rows)

In [30]:
# Turn the endpoint's newline-separated bytes payload into a list of probability strings.
# The isinstance guard makes the cell safe to re-run: once `result` has been parsed
# into a list it is left untouched instead of failing on `.decode`.
# splitlines() ignores only a genuine trailing newline, so the last prediction is
# kept even when the payload does not end with "\n" (the previous slice always
# dropped the final element).
if isinstance(result, (bytes, bytearray)):
    result = result.decode("utf-8").splitlines()

In [31]:
# Show the parsed probability strings
result

['0.08117855340242386',
 '0.9971675276756287',
 '0.9987726807594299',
 '0.9963440299034119']

In [32]:
def proba_to_class(arr_res,margin=.5):
    """Convert predicted probabilities into 0/1 class labels.

    Args:
        arr_res: iterable of probabilities; each item is passed through
            ``float()``, so numeric strings are accepted.
        margin: decision threshold; values greater than or equal to it map to 1.

    Returns:
        A new list of ints (1 or 0), in the same order as the input.
    """
    return [1 if float(score) >= margin else 0 for score in arr_res]

In [33]:
# Label the sample predictions using the default 0.5 threshold
proba_to_class(result)

[0, 1, 1, 1]