In [1]:
import os

import sagemaker
import boto3
import botocore
import time
import uuid
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input, Session

import sklearn
from sklearn.model_selection import train_test_split

  from pandas.core.computation.check import NUMEXPR_INSTALLED


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:
bucket_name = 'exampleproject'
my_region = botocore.session.Session().get_config_variable('region')
print(my_region) # this is where we set the region to London

eu-west-2


In [3]:
# Generate a unique bucket name by appending a timestamp and a random UUID
bucket_name = f'exampleproject-{int(time.time())}-{str(uuid.uuid4())[:8]}'

my_region = 'eu-west-2'  # Set the desired region explicitly
print(f"Selected region: {my_region}")

s3 = boto3.resource('s3', region_name=my_region)

try:
    if my_region == boto3.session.Session().region_name:
        # Specify the location constraint based on the selected region
        location_constraint = my_region

        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': location_constraint}
        )
        print(f"S3 bucket {bucket_name} created successfully")
    else:
        print("Region mismatch.")
except Exception as e:
    print(f"S3 error: {e}")

Selected region: eu-west-2
S3 bucket exampleproject-1701781738-fa188549 created successfully


In [4]:
# Set the output path to the location where the trained model will be saved
prefix = 'xgboost-as-a-built-in-algo'
output_path = 's3://{}/{}/output'.format(bucket_name, prefix)
print(output_path)

# As you can see, now every time I train the model, a new instance will be created in the bucket, and then we can
# easily see how the model evolves as we tune the hyperparameters, modify the data etc.

s3://exampleproject-1701781738-fa188549/xgboost-as-a-built-in-algo/output


## Downloading the dataset and storing in S3 bucket

In [5]:
import pandas as pd
import numpy as np
import urllib

try:
    urllib.request.urlretrieve ("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", "bank_clean.csv")
    print(f"Success: data downloaded bank_clean.csv.")
except Exception as e:
    print(f"Data loading error: {e}.")
    
try:
    model_data = pd.read_csv('./bank_clean.csv', index_col = 0)
    print(f"Success: data loaded into dataframe.")
except Exception as e:
    print(f"Data loading error: {e}.")

Success: data downloaded bank_clean.csv.
Success: data loaded into dataframe.


In [6]:
model_data.head(10)

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
0,56,1,999,0,1,0,0,0,0,1,...,0,1,0,0,0,0,1,0,1,0
1,57,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
2,37,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
3,40,1,999,0,1,0,1,0,0,0,...,0,1,0,0,0,0,1,0,1,0
4,56,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
5,45,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
6,59,1,999,0,1,0,1,0,0,0,...,0,1,0,0,0,0,1,0,1,0
7,41,1,999,0,1,0,0,1,0,0,...,0,1,0,0,0,0,1,0,1,0
8,24,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
9,25,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0


In [7]:
model_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41188 entries, 0 to 41187
Data columns (total 61 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   age                            41188 non-null  int64
 1   campaign                       41188 non-null  int64
 2   pdays                          41188 non-null  int64
 3   previous                       41188 non-null  int64
 4   no_previous_contact            41188 non-null  int64
 5   not_working                    41188 non-null  int64
 6   job_admin.                     41188 non-null  int64
 7   job_blue-collar                41188 non-null  int64
 8   job_entrepreneur               41188 non-null  int64
 9   job_housemaid                  41188 non-null  int64
 10  job_management                 41188 non-null  int64
 11  job_retired                    41188 non-null  int64
 12  job_self-employed              41188 non-null  int64
 13  job_services         

In [8]:
model_data.describe()

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,...,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,2.567593,962.475454,0.172963,0.963217,0.087623,0.253035,0.224677,0.03535,0.025736,...,0.190031,0.206711,0.209357,0.196416,0.197485,0.103234,0.863431,0.033335,0.887346,0.112654
std,10.42125,2.770014,186.910907,0.494901,0.18823,0.282749,0.434756,0.417375,0.184665,0.158348,...,0.39233,0.404951,0.406855,0.397292,0.398106,0.304268,0.343396,0.179512,0.316173,0.316173
min,17.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,32.0,1.0,999.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
50%,38.0,2.0,999.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
75%,47.0,3.0,999.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
max,98.0,56.0,999.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
X = model_data.iloc[:,:-2]
y = model_data.iloc[:,-1:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 123)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(28831, 59) (12357, 59) (28831, 1) (12357, 1)


In [10]:
# Create the .csv files containing the training and testing sets
train_data = pd.concat([y_train, X_train], axis = 1).to_csv('train_data.csv', index = False, header = False)
test_data = pd.concat([y_test, X_test], axis = 1).to_csv('test_data.csv', index = False, header = False)

In [11]:
# Save the train data into the bucket as a csv file
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train_data/train_data.csv')).upload_file('train_data.csv')
s3_input_train = sagemaker.TrainingInput(s3_data = 's3://{}/{}/train'.format(bucket_name, prefix), content_type = 'csv')

In [12]:
# Save the test data into the bucket as a csv file
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'test_data/test_data.csv')).upload_file('test_data.csv')
s3_input_test = sagemaker.TrainingInput(s3_data = 's3://{}/{}/test'.format(bucket_name, prefix), content_type = 'csv')

## Build and train the XGBoost model

In [13]:
from sagemaker import get_execution_role, Session
from sagemaker.estimator import Estimator
from sagemaker.image_uris import retrieve

# Specify the XGBoost framework and region
framework = 'xgboost'
region = 'eu-west-2'

# Get the supported versions for XGBoost
supported_versions = [retrieve(framework, region=region, version=version) for version in ['1', '0.90-1', '0.90-2', '1.0-1', '1.2-1', '1.2-2', '1.3-1', '1.5-1', '1.7-1', 'latest']]

print("Supported XGBoost versions:", supported_versions)

# Assuming you want to get the URI for a specific algorithm, for example, xgboost
container_uri = '764974769150.dkr.ecr.eu-west-2.amazonaws.com/sagemaker-xgboost:0.90-1-cpu-py3'

sagemaker_session = Session()

# Define other parameters
role = get_execution_role()

Supported XGBoost versions: ['644912444149.dkr.ecr.eu-west-2.amazonaws.com/xgboost:1', '764974769150.dkr.ecr.eu-west-2.amazonaws.com/sagemaker-xgboost:0.90-1-cpu-py3', '764974769150.dkr.ecr.eu-west-2.amazonaws.com/sagemaker-xgboost:0.90-2-cpu-py3', '764974769150.dkr.ecr.eu-west-2.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3', '764974769150.dkr.ecr.eu-west-2.amazonaws.com/sagemaker-xgboost:1.2-1', '764974769150.dkr.ecr.eu-west-2.amazonaws.com/sagemaker-xgboost:1.2-2', '764974769150.dkr.ecr.eu-west-2.amazonaws.com/sagemaker-xgboost:1.3-1', '764974769150.dkr.ecr.eu-west-2.amazonaws.com/sagemaker-xgboost:1.5-1', '764974769150.dkr.ecr.eu-west-2.amazonaws.com/sagemaker-xgboost:1.7-1', '644912444149.dkr.ecr.eu-west-2.amazonaws.com/xgboost:latest']
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK d

In [14]:
hyperparameters = {
    "max_depth":"5",
    "eta":"0.2",
    "gamma":"4",
    "min_child_weight":"6",
    "subsample":"0.7",
    "objective":"binary:logistic",
    "num_round": 20
}

In [15]:
estimator = Estimator(image_uri = container_uri,
                     hyperparameters = hyperparameters,
                     role = role,
                     instance_count = 1,
                     instance_type = "ml.m5.large",
                     volume_size = 5,
                     output_path = output_path,
                     use_spot_instances = True,
                     max_run = 300,
                     max_wait = 600,
                     sagemaker_session = sagemaker_session)

estimator.set_hyperparameters(**hyperparameters)

In [16]:
estimator.fit({'train': s3_input_train, 'validation': s3_input_test})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-12-05-13-09-01-237


2023-12-05 13:09:01 Starting - Starting the training job...
2023-12-05 13:09:17 Starting - Preparing the instances for training.........
2023-12-05 13:10:48 Downloading - Downloading input data......
2023-12-05 13:11:33 Training - Downloading the training image..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[13:12:09] 28831x59 matrix with 1701029 entries loaded from /opt/ml/input/data/train/train_data?format=csv&label_column=0&delimiter=,[0m
[3

## Deploy the XGBoost Model

In [17]:
xgb_predictions = estimator.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-12-05-13-12-43-593
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-12-05-13-12-43-593
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-12-05-13-12-43-593


----!

## Predictions using the test data

In [34]:
from sagemaker.serializers import CSVSerializer

test_data_array = test_data.iloc[:,1:].values
csv_serializer = CSVSerializer()
payload = csv_serializer.serialize(test_data_array)

xgb_predictions.content_type = 'text/csv'
xgb_predictions.serializer = csv_serializer
predictions = xgb_predictions.predict(payload).decode('utf-8')
predictions_array = np.fromstring(predictions[1:], sep = ',')
print(predictions_array.shape)

(12356,)
(12356,)


In [36]:
cm = pd.crosstab(index=test_data.iloc[:,0], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Purchase", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Purchase", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))


Overall Classification Rate: 90.0%

Predicted      No Purchase    Purchase
Observed
No Purchase    91% (10843)    29% (117)
Purchase        9% (1116)     71% (280) 



## Delete the endpoint

In [38]:
sagemaker.Session().delete_endpoint(xgb_predictions.endpoint)
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2023-12-05-13-12-43-593


[{'ResponseMetadata': {'RequestId': 'J8H29MJSHF906FNP',
   'HostId': '4pQeDlSogM0IN82T9urCQIPlBUcmrVA8mnPBJUUk333C+u6X9nucP3YlHCMYuJhIS5nnJJIOWh0=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': '4pQeDlSogM0IN82T9urCQIPlBUcmrVA8mnPBJUUk333C+u6X9nucP3YlHCMYuJhIS5nnJJIOWh0=',
    'x-amz-request-id': 'J8H29MJSHF906FNP',
    'date': 'Tue, 05 Dec 2023 13:26:24 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2023-12-05-13-09-01-237/output/model.tar.gz'},
   {'Key': 'xgboost-as-a-built-in-algo/train_data/train_data.csv'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2023-12-05-13-09-01-237/profiler-output/system/incremental/2023120513/1701781920.algo-1.json'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2023-12-05-13-09-01-237/profiler-output/system/training_