# Steps:

1. Import necessary libraries
2. Creating S3 bucket
3. mapping train and test data in S3
4. mapping the path of the models in s3

In [2]:
import sagemaker # using builtin algorithm
import boto3 # read from s3 bucket even from local if its public
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input, Session

In [3]:
bucket_name="bankapplicationnnn" # give any unique name for your bucket
my_region=boto3.session.Session().region_name # set the region of the instance
print(my_region)

us-east-2


In [30]:
#create s3 bucket using code, we can also create manually

s3= boto3.resource("s3")
try:
    if my_region == "us-east-2":
        s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={"LocationConstraint": my_region})
    print("S3 bucket created successfully")
except Exception as e:
    print("S3 error: ",e)

S3 bucket created successfully


In [4]:
# set an output path where the trained model will be saved

prefix="xgboost-as-a-built-in-algo"
output_path="s3://{}/{}/output".format(bucket_name,prefix)
print(output_path)

s3://bankapplicationnnn/xgboost-as-a-built-in-algo/output


# Downloading and storing dataset in s3 bucket

In [8]:
import pandas as pd
import urllib

try:
    urllib.request.urlretrieve("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", "bank_clean.csv")
    print("Success: downloaded bank_clean.csv.")
except Exception as e:
    print("Data Error: ",e)

try:
    model_data=pd.read_csv("./bank_clean.csv", index_col=0)
    print("Success: data loaded in dataframe")
except Exception as e:
    print("Data Error: ",e)

Success: downloaded bank_clean.csv.
Success: data loaded in dataframe


In [9]:
model_data.head()

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
0,56,1,999,0,1,0,0,0,0,1,...,0,1,0,0,0,0,1,0,1,0
1,57,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
2,37,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
3,40,1,999,0,1,0,1,0,0,0,...,0,1,0,0,0,0,1,0,1,0
4,56,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0


In [10]:
model_data.shape

(41188, 61)

In [11]:
# train test split

import numpy as np
train_data,test_data=np.split(model_data.sample(frac=1, random_state=1729),[int(0.7*len(model_data))])

In [13]:
train_data.shape, test_data.shape

((28831, 61), (12357, 61))

In [22]:
# save data

# In sagemaker, dependent variable should be used first

import os
pd.concat([train_data["y_yes"], train_data.drop(["y_yes","y_no"], axis=1)], axis=1).to_csv("train.csv", index=False, header=False)

boto3.Session().resource("s3").Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
s3_input_train=sagemaker.TrainingInput(s3_data="s3://{}/{}/train".format(bucket_name,prefix), content_type="csv")

In [25]:
s3_input_train

<sagemaker.inputs.TrainingInput at 0x7f89aae44a20>

In [26]:
import os
pd.concat([test_data["y_yes"], test_data.drop(["y_yes","y_no"], axis=1)], axis=1).to_csv("test.csv", index=False, header=False)

boto3.Session().resource("s3").Bucket(bucket_name).Object(os.path.join(prefix, 'test/test.csv')).upload_file('test.csv')
s3_input_test=sagemaker.TrainingInput(s3_data="s3://{}/{}/test".format(bucket_name,prefix), content_type="csv")

In [27]:
s3_input_test

<sagemaker.inputs.TrainingInput at 0x7f89aa953b38>

# Builing pre built xgboost algorithm

In [28]:
# inbuilt alogithm are always in form of container, we have to get those container first

container= get_image_uri(boto3.Session().region_name, "xgboost", repo_version="1.0-1")

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [42]:
# initialize hyper parameter

hyperparameters={
    "max_depth":"5",
    "eta":"0.2",
    "gamma":"4",
    "min_child_weight":"6",
    "num_round":"6",
    "subsample":"0.7",
    "objective":"binary:logistic" # reg:linear , it is regression
}

In [43]:
# create a sagemaker estimator that calls the xgboost container

estimator = sagemaker.estimator.Estimator(image_uri=container,
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),  # I am role
                                          instance_type = 'ml.m5.2xlarge', # small charge
                                          instance_count= 1,
                                          volume_size = 5, #5 GB
                                          output_path=output_path,
                                           use_spot_instances=True, # reduce billing time
                                           max_run=300, # reduce billing time
                                           max_wait= 600 # reduce billing time
                                         )

In [44]:
# Training: will take some time

estimator.fit({"train":s3_input_train, "validation": s3_input_test})

2021-10-22 07:22:39 Starting - Starting the training job...
2021-10-22 07:23:03 Starting - Launching requested ML instancesProfilerReport-1634887359: InProgress
......
2021-10-22 07:24:03 Starting - Preparing the instances for training............
2021-10-22 07:26:04 Downloading - Downloading input data
2021-10-22 07:26:04 Training - Downloading the training image..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[07:26:20] 28831x59 matrix with 1701

# Deploy the model

In [45]:
xgb_predictor=estimator.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

-----!

## Prediction of test data

In [48]:
from sagemaker.predictor import csv_serializer
test_data_array=test_data.drop(["y_no","y_yes"], axis=1).values # load data into array
#xgb_predictor.content_type = 'text/csv' # set the data type for an inference
xgb_predictor.serializer= csv_serializer # set the serializer type
predictions= xgb_predictor.predict(test_data_array).decode("utf-8") # predict
predictions_array = np.fromstring(predictions[1:], sep=",")
print(predictions_array.shape)

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(12357,)


In [49]:
predictions_array

array([0.17334226, 0.17334226, 0.17334226, ..., 0.16913538, 0.16913538,
       0.16913538])

In [72]:
cm = pd.crosstab(index=test_data["y_yes"], columns=np.round(predictions_array), rownames=["Observed"], colnames=["Predicted"])
tn=cm.iloc[0,0]; fn=cm.iloc[1,0]; tp=cm.iloc[1,1]; fp=cm.iloc[0,1]; p=(tp+tn)/(tp+tn+fp+fn)*100
print("Overall Classification rate: {}\n".format(p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")
print("No Purchase   {} ({})   {} ({})".format(np.round(tn/(tn+fn)*100,1),tn, np.round(fp/(tp+fp)*100,1),fp))
print("Purchase      {} ({})     {} ({})".format(np.round(fn/(tn+fn)*100,1),fn, np.round(tp/(tp+fp)*100,1),tp))

Overall Classification rate: 89.73860969490977

Predicted      No Purchase    Purchase
Observed
No Purchase   90.4 (10823)   29.8 (113)
Purchase      9.6 (1155)     70.2 (266)


In [73]:
# deleting the endpoint to avoid charges


In [74]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)
bucket_to_delete = boto3.resource("s3").Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


[{'ResponseMetadata': {'RequestId': 'CAZDQ6SVKWSGMT5Z',
   'HostId': '1ALbjXmzEw8BBlL6qUFySuoyrltj4j/MGdy5pkdw2tWVVC1a936i7FHIVXDMiwM4HSiiPlgNKDo=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': '1ALbjXmzEw8BBlL6qUFySuoyrltj4j/MGdy5pkdw2tWVVC1a936i7FHIVXDMiwM4HSiiPlgNKDo=',
    'x-amz-request-id': 'CAZDQ6SVKWSGMT5Z',
    'date': 'Fri, 22 Oct 2021 08:28:17 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2021-10-22-07-22-39-598/profiler-output/framework/training_job_end.ts'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2021-10-22-07-22-39-598/profiler-output/system/incremental/2021102207/1634887500.algo-1.json'},
   {'Key': 'xgboost-as-a-built-in-algo/train/train.csv'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2021-10-22-07-22-39-598/rule-output/Prof