In [1]:
import configparser

In [2]:
# Create a ConfigParser object
config = configparser.ConfigParser()

# Read the configuration file. ConfigParser.read() silently skips files it
# cannot open and returns only the names it actually parsed, so fail fast here
# instead of hitting an opaque KeyError when the credentials are looked up.
loaded_config_files = config.read('aws_config.cfg')
if not loaded_config_files:
    raise FileNotFoundError("Could not read 'aws_config.cfg' from the working directory")

# Keep the list of parsed files as the cell output.
loaded_config_files

['aws_config.cfg']

In [3]:
# Pull the access key pair and the region out of the [default] section
aws_access_key_id, aws_secret_access_key, region = (
    config['default'][option]
    for option in ('aws_access_key_id', 'aws_secret_access_key', 'region')
)

In [4]:
import sagemaker
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri 
#alternative
from sagemaker.amazon.amazon_estimator import image_uris
from sagemaker.session import s3_input, Session



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [5]:
# Region that the default boto3 session resolves to
my_region = boto3.session.Session().region_name
# Name of the S3 bucket used by the rest of the notebook
bucket_name = 'dummy-dummy-bankapp'
print(my_region)

us-east-1


In [6]:
# SECURITY: never paste access keys into a notebook cell, not even as a
# comment. Credentials are read from aws_config.cfg; any key pair that was
# ever saved inside this notebook must be treated as leaked and rotated.
s3 = boto3.resource(
    's3',
    region_name=region,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)
# NOTE(review): the resource uses `region` from the config file while the
# branch below uses `my_region` from the default session - confirm they match.
try:
    if my_region == 'us-east-1':
        # us-east-1 is the S3 default location and is created without a
        # LocationConstraint.
        s3.create_bucket(Bucket=bucket_name)
    else:
        # Any other region has to be named explicitly. Previously nothing was
        # created here, yet the success message was still printed.
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': my_region},
        )
    print('S3 bucket created successfully')
except Exception as e:
    # Best-effort: report the failure and let the notebook continue.
    print('S3 error: ',e)

S3 bucket created successfully


In [7]:
# Key prefix shared by every object this notebook writes to the bucket
prefix = 'xgboost-as-a-built-in-algo'
# S3 URI under the bucket/prefix, passed on as the estimator's output location
output_path = f's3://{bucket_name}/{prefix}/output'
print(output_path)

s3://dummy-dummy-bankapp/xgboost-as-a-built-in-algo/output


In [8]:
import pandas as pd
# `import urllib` alone does not load the `request` submodule; it only worked
# when another package happened to import it first. Import it explicitly.
import urllib.request

# Step 1: fetch the CSV next to the notebook.
try:
    #the bank data is in one hot encoded format already
    urllib.request.urlretrieve ("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", "bank_clean.csv")
    print('Success: downloaded bank_clean.csv.')
except Exception as e:
    print('Data load error: ',e)

# Step 2: load the local copy; the first CSV column becomes the index.
# This still runs after a failed download, so a previously downloaded file is reused.
try:
    model_data = pd.read_csv('./bank_clean.csv',index_col=0)
    print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)

Success: downloaded bank_clean.csv.
Success: Data loaded into dataframe.


In [9]:
import numpy as np

# Shuffle every row once with a fixed seed so the split is reproducible.
shuffled_data = model_data.sample(frac=1, random_state=1729)
# The first 70% of the shuffled rows are used for training, the rest for testing.
split_index = int(0.7 * len(model_data))
# Slice with .iloc instead of np.split(): on a DataFrame np.split() relies on
# DataFrame.swapaxes, which newer pandas releases deprecate. The resulting
# frames are the same two row ranges.
train_data = shuffled_data.iloc[:split_index]
test_data = shuffled_data.iloc[split_index:]
print(train_data.shape, test_data.shape)

(28831, 61) (12357, 61)


In [10]:
### Saving Train And Test Into Buckets
## We start with Train Data
import os

# The file is written without header or index, with the label (y_yes) as the
# first column followed by the feature columns.
train_features = train_data.drop(['y_no', 'y_yes'], axis=1)
pd.concat([train_data['y_yes'], train_features], axis=1).to_csv('train.csv', index=False, header=False)

# Upload train.csv to the bucket. S3 object keys always use '/', so build the
# key explicitly instead of with os.path.join (which is platform-dependent).
train_key = '{}/train/train.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_key).upload_file('train.csv')

# Training channel pointing at the uploaded prefix
s3_input_train = sagemaker.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')


In [11]:

# Test Data Into Buckets
# Same layout as the training file: label (y_yes) first, then the features,
# written without header or index.
test_features = test_data.drop(['y_no', 'y_yes'], axis=1)
pd.concat([test_data['y_yes'], test_features], axis=1).to_csv('test.csv', index=False, header=False)

# S3 object keys always use '/', so build the key explicitly instead of with
# os.path.join (which is platform-dependent).
test_key = '{}/test/test.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(test_key).upload_file('test.csv')

# Channel pointing at the uploaded test prefix
s3_input_test = sagemaker.TrainingInput(s3_data='s3://{}/{}/test'.format(bucket_name, prefix), content_type='csv')

In [12]:
# Image URI of the built-in XGBoost algorithm (version 1.5-1) for the session's region
container = image_uris.retrieve(
    framework='xgboost',
    region=boto3.Session().region_name,
    version='1.5-1',
)

# Hyperparameters handed to the training container
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="binary:logistic",
    num_round=50,
)

# One ml.m5.2xlarge instance with a 5 GB volume; artifacts are written to output_path
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=sagemaker.get_execution_role(),
    instance_type='ml.m5.2xlarge',
    instance_count=1,
    volume_size=5,
    output_path=output_path,
    hyperparameters=hyperparameters,
)

# Train on the 'train' channel and pass the test split as the 'validation' channel
estimator.fit(inputs={'train': s3_input_train, 'validation': s3_input_test})


2025-01-31 05:58:55 Starting - Starting the training job...
..25-01-31 05:59:09 Starting - Preparing the instances for training.
..25-01-31 05:59:48 Downloading - Downloading the training image.
..25-01-31 06:00:23 Training - Training image download completed. Training in progress..
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-01-31 06:00:44.227 ip-10-0-143-155.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-01-31 06:00:44.249 ip-10-0-143-155.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-01-31:06:00:44:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-01-31:06:00:44:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2025-01-31:06:00:44:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-01-31:06:00:44:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2025-01-31:06:0

In [13]:
from sagemaker.serializers import CSVSerializer

# Deploy the trained estimator to a single ml.m5.2xlarge endpoint instance;
# the predictor is given a CSV serializer for its requests.
xgb_predictor = estimator.deploy(
    serializer=CSVSerializer(),
    instance_type='ml.m5.2xlarge',
    initial_instance_count=1,
)

------!

In [14]:
# Feature matrix of the test split: drop both label columns and take the raw array
test_data_array = test_data.drop(columns=['y_no', 'y_yes']).to_numpy()

print(test_data_array.shape)

# Send all rows to the endpoint in one request and decode the byte response
# to text. The CSV serializer was already attached when the predictor was created.
predictions = xgb_predictor.predict(test_data_array).decode('utf-8')

(12357, 59)


In [15]:
# Turn the newline-separated response text into a float array.
# The whole string is parsed: slicing off the first character (as was done
# before) drops a digit from the first prediction, which silently corrupts it
# whenever that value does not start with "0." (e.g. "1.0" or "9.5e-07").
predictions_array = np.fromstring(predictions, sep='\n')
print(predictions_array.shape)

(12357,)


In [16]:
import sklearn.metrics

# Probability above which a row is labelled as the positive class.
cutoff = 0.5
# Apply the cutoff once and reuse the labels for both reports. The strict `>`
# reproduces the previous np.round() labelling for probabilities in [0, 1]
# (np.round sends exactly 0.5 to 0), while actually honouring `cutoff`.
predicted_labels = (predictions_array > cutoff).astype(int)

print(sklearn.metrics.confusion_matrix(test_data['y_yes'], predicted_labels))
print(sklearn.metrics.classification_report(test_data['y_yes'], predicted_labels))

[[10785   151]
 [ 1124   297]]
              precision    recall  f1-score   support

           0       0.91      0.99      0.94     10936
           1       0.66      0.21      0.32      1421

    accuracy                           0.90     12357
   macro avg       0.78      0.60      0.63     12357
weighted avg       0.88      0.90      0.87     12357



In [17]:
# Name of the deployed endpoint. SageMaker SDK v2 predictors expose it as
# `endpoint_name`; `end_point` is not an attribute and raises AttributeError.
xgb_predictor.endpoint_name

In [19]:
import boto3
import json

# One CSV row of feature values, with no label column
tst='29,2,999,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0'

# Low-level runtime client built with the credentials from the config file
runtime = boto3.Session().client('sagemaker-runtime',region_name=region, 
                  aws_access_key_id=aws_access_key_id, 
                  aws_secret_access_key=aws_secret_access_key)

# Call the endpoint deployed by this notebook. The name is taken from the
# predictor rather than hard-coded, because each deploy creates a new
# timestamped endpoint name and a stale literal fails on re-run.
response = runtime.invoke_endpoint(
    EndpointName=xgb_predictor.endpoint_name, ContentType="text/csv", Body=tst
)
# Strip the trailing newline of the response so the sentence prints on one line.
result = response["Body"].read().decode("ascii").strip()
print("Predicted Class Probabilities: {}.".format(result))


Predicted Class Probabilities: 0.05214285850524902
.


In [20]:
# List the names of the buckets visible to the configured credentials
s3 = boto3.client(
    's3',
    region_name=region,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

response = s3.list_buckets()

# Print one indented bucket name per line under a heading
print('Existing buckets:')
for name in (entry["Name"] for entry in response['Buckets']):
    print(f'  {name}')


Existing buckets:
  dummy-dummy-bankapp
