# Train XGBoost model

In [None]:
import sagemaker
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri
# S3 layout: input CSVs are read from `input/`, model artifacts are written
# under `models/`.
bucket = 'toronto-house-price-project'
prefix = 'input'
prefix_output = 'models'
role = sagemaker.get_execution_role()

# Built-in XGBoost image (repo version 1.0-1) for the session's region.
container = get_image_uri(
    boto3.Session().region_name,
    'xgboost',
    repo_version='1.0-1',
)
train_data = f's3://{bucket}/{prefix}/train'
validation_data = f's3://{bucket}/{prefix}/validation'

# Both channels are declared as CSV input.
train_channel = sagemaker.session.s3_input(train_data, content_type='text/csv')
valid_channel = sagemaker.session.s3_input(validation_data, content_type='text/csv')
data_channels = {'train': train_channel, 'validation': valid_channel}

# Estimator: one ml.m4.xlarge instance with a 1 GB volume; artifacts go to
# `s3_output_location`.
s3_output_location = f's3://{bucket}/{prefix_output}/xgboost_model'
xgb_model = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    train_volume_size=1,
    output_path=s3_output_location,
    sagemaker_session=sagemaker.Session(),
)
xgb_model.set_hyperparameters(
    objective='reg:linear',
    eval_metric='rmse',
    num_round=5000,
    eta=0.01,
    max_depth=7,
    min_child_weight=1,
    gamma=0.1,
)

# Train with both the train and validation channels, streaming the job logs.
xgb_model.fit(inputs=data_channels, logs=True)
# Alternative without a validation channel:
# xgb_model.fit({'train': train_channel})

# Deploy the model

In [None]:
# Host the trained estimator on a single ml.t2.medium instance under the
# endpoint name 'endpoint-v1' and keep the returned predictor handle.
xgb_predictor = xgb_model.deploy(
    endpoint_name='endpoint-v1',
    instance_type='ml.t2.medium',
    initial_instance_count=1,
    content_type='text/csv',
)


In [None]:
# download test data
import boto3
import pandas as pd
import numpy as np
# Locate the test CSV in the project bucket.
bucket = 'toronto-house-price-project'
file_name = 'input/test.csv'
s3 = boto3.resource('s3')
obj = s3.Object(bucket, file_name)

# Read the object body as a header-less CSV and convert it to a NumPy array.
test_data_array = np.array(pd.read_csv(obj.get()['Body'], header=None))

In [None]:
# Use the entry at index 1 of the loaded array as the sample to score.
test_data = test_data_array[1]

In [None]:
from sagemaker.predictor import csv_serializer
# Configure the predictor to send requests as CSV.
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer

# Invoke the endpoint with the sample and decode the response as UTF-8 text.
predictions = (
    xgb_predictor
    .predict(test_data)
    .decode('utf-8')
)

# Optional: turn the text into an array with
# np.fromstring(predictions[1:], sep=',')

print(predictions)

# Terminate endpoint

In [None]:
# Delete the endpoint that `xgb_model.deploy(...)` created earlier in this
# notebook. Its handle is `xgb_predictor`; `xgb_predictor_1` is only created in
# a later section, so using it here raises NameError on a top-to-bottom run.
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)

# Batch transform

In [None]:
# Input for the batch job: the test CSV under `prefix` in the project bucket.
batch_input = f's3://{bucket}/{prefix}/test.csv'
# Destination for the batch transform results.
batch_output = f's3://{bucket}/output/batch-inference'

# Run the transform on one ml.m4.xlarge instance and block until it finishes.
transformer = xgb_model.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=batch_output,
)
transformer.transform(data=batch_input, content_type='text/csv')
transformer.wait()

# create endpoint from existing model artifacts

## Retrieve model from S3

In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import os

# Define IAM role
import boto3
import re
from sagemaker import get_execution_role
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
#from sagemaker.amazon.amazon_estimator import get_image_uri
#image_name = get_image_uri(boto3.Session().region_name, 'xgboost')
# Built-in XGBoost image (repo version 1.0-1) for the session's region.
container = get_image_uri(
    boto3.Session().region_name,
    'xgboost',
    repo_version='1.0-1',
)
# Model artifact written by an earlier training job.
s3_model_location = 's3://toronto-house-price-project/models/xgboost_model/sagemaker-xgboost-2020-10-19-17-51-06-651/output/model.tar.gz'
role = get_execution_role()
sess = sagemaker.Session()

# Wrap the existing artifact in a Model object so it can be deployed
# without retraining.
xgb_model_1 = sagemaker.model.Model(
    model_data=s3_model_location,
    image=container,
    role=role,
    sagemaker_session=sess,
)


## deploy the model to endpoint

In [None]:
# Deploy the model to an endpoint backed by one ml.t2.medium instance.
# The endpoint name has to be changed on every run; otherwise it conflicts
# with the existing endpoint of the same name.
xgb_model_1.deploy(
    endpoint_name='endpoint-xgboost',
    instance_type='ml.t2.medium',
    initial_instance_count=1,
)



## create a real time predictor with the deployed endpoint

In [None]:
# Attach a real-time predictor to the endpoint deployed in the previous step.
# The name must match the `endpoint_name` passed to `xgb_model_1.deploy(...)`
# ('endpoint-xgboost'); keep the two in sync whenever the deploy name changes.
endpoint_name = 'endpoint-xgboost'
xgb_predictor_1 = sagemaker.predictor.RealTimePredictor(endpoint=endpoint_name)

## load test data from s3

In [None]:
# download test data
import boto3
import pandas as pd
import numpy as np
# Locate the test CSV in the project bucket.
bucket = 'toronto-house-price-project'
file_name = 'input/test.csv'
s3 = boto3.resource('s3')
obj = s3.Object(bucket, file_name)

# Read the object body as a header-less CSV and convert it to a NumPy array.
test_data_array = np.array(pd.read_csv(obj.get()['Body'], header=None))

# Use the entry at index 1046 as the sample to score.
test_data = test_data_array[1046]

## predict with endpoint

In [None]:
from sagemaker.predictor import csv_serializer
# Configure the predictor to send requests as CSV.
xgb_predictor_1.content_type = 'text/csv'
xgb_predictor_1.serializer = csv_serializer

# Invoke the endpoint with the sample and decode the response as UTF-8 text.
predictions = (
    xgb_predictor_1
    .predict(test_data)
    .decode('utf-8')
)

# Optional: turn the text into an array with
# np.fromstring(predictions[1:], sep=',')

print(predictions)

# Hyperparameters tuning

In [None]:
import sagemaker
import boto3
from sagemaker.predictor import csv_serializer    # Converts strings for HTTP POST requests on inference

import numpy as np                                # For performing matrix operations and numerical processing
import pandas as pd                               # For manipulating tabular data
from time import gmtime, strftime                 
import os 
 
from sagemaker import get_execution_role

# Region of the default boto3 session and a low-level SageMaker client used
# to submit the tuning job.
region = boto3.Session().region_name
smclient = boto3.Session().client('sagemaker')

# Execution role of this notebook; printed for reference.
role = get_execution_role()
print(role)

# Bucket and key prefix used for the tuning job's input and output.
# NOTE(review): earlier cells read data from the `input` prefix — confirm the
# train/validation data also exists under this prefix.
bucket = 'toronto-house-price-project'
prefix = 'sagemaker/DEMO-automatic-model-tuning-xgboost-dm'

In [None]:
# Search space, resource budget, strategy and objective for the
# hyperparameter tuning job (passed as HyperParameterTuningJobConfig).
tuning_job_config = {
    "ParameterRanges": {
        "CategoricalParameterRanges": [],
        "ContinuousParameterRanges": [
            {
                "MaxValue": "1",
                "MinValue": "0",
                "Name": "eta",
            },
            {
                "MaxValue": "2",
                "MinValue": "0",
                "Name": "alpha",
            },
            {
                "MaxValue": "10",
                "MinValue": "1",
                "Name": "min_child_weight",
            },
        ],
        "IntegerParameterRanges": [
            {
                "MaxValue": "10",
                "MinValue": "1",
                "Name": "max_depth",
            },
        ],
    },
    # At most 20 training jobs in total, 3 running at the same time.
    "ResourceLimits": {
        "MaxNumberOfTrainingJobs": 20,
        "MaxParallelTrainingJobs": 3,
    },
    "Strategy": "Bayesian",
    # The objective is an error metric, so it must be minimized; maximizing it
    # would steer the search towards the worst models. RMSE matches the
    # eval_metric used when training the model earlier in this notebook.
    "HyperParameterTuningJobObjective": {
        "MetricName": "validation:rmse",
        "Type": "Minimize",
    },
}

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri
# Built-in XGBoost image for the session's region (default repo version).
training_image = get_image_uri(boto3.Session().region_name, 'xgboost')

# Train / validation inputs under the tuning prefix.
s3_input_train = 's3://{}/{}/train'.format(bucket, prefix)
s3_input_validation = 's3://{}/{}/validation/'.format(bucket, prefix)

# Template for every training job launched by the tuner: image, input
# channels, output location, resources, role, fixed hyperparameters and
# stopping condition (passed as TrainingJobDefinition).
training_job_definition = {
    "AlgorithmSpecification": {
        "TrainingImage": training_image,
        "TrainingInputMode": "File"
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "CompressionType": "None",
            "ContentType": "csv",
            "DataSource": {
                "S3DataSource": {
                    "S3DataDistributionType": "FullyReplicated",
                    "S3DataType": "S3Prefix",
                    "S3Uri": s3_input_train
                }
            }
        },
        {
            "ChannelName": "validation",
            "CompressionType": "None",
            "ContentType": "csv",
            "DataSource": {
                "S3DataSource": {
                    "S3DataDistributionType": "FullyReplicated",
                    "S3DataType": "S3Prefix",
                    "S3Uri": s3_input_validation
                }
            }
        }
    ],
    "OutputDataConfig": {
        "S3OutputPath": "s3://{}/{}/output".format(bucket, prefix)
    },
    "ResourceConfig": {
        "InstanceCount": 2,
        "InstanceType": "ml.c4.2xlarge",
        "VolumeSizeInGB": 10
    },
    "RoleArn": role,
    # Hyperparameters held fixed across all tuning trials. The model trained
    # earlier in this notebook is a regressor (objective 'reg:linear',
    # eval_metric 'rmse'), so the same regression objective and metric are
    # used here instead of a binary-classification objective with AUC.
    "StaticHyperParameters": {
        "eval_metric": "rmse",
        "num_round": "100",
        "objective": "reg:linear",
        "rate_drop": "0.3",
        "tweedie_variance_power": "1.4"
    },
    # Stop any single training job after 12 hours (43200 seconds).
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 43200
    }
}

In [None]:
# Submit the tuning job, combining the tuning configuration with the
# training job template, under a fixed job name.
tuning_job_name = "MyTuningJob"
smclient.create_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name,
    HyperParameterTuningJobConfig=tuning_job_config,
    TrainingJobDefinition=training_job_definition,
)