In [83]:
import sagemaker
import boto3
from sagemaker import image_uris
from sagemaker.session import s3_input, Session
from sagemaker.serializers import CSVSerializer
import pandas as pd
import numpy as np
import os

In [2]:
my_region = boto3.session.Session().region_name
print(my_region)

us-east-1


In [3]:
s3 = boto3.resource('s3')
bucket_name = 'bank-churn-bucket'

buckets = [bucket.name for bucket in s3.buckets.all()]

if bucket_name in buckets:
    print("Bucket exists!")
else:
    print("Bucket does not exist.")

Bucket exists!


In [7]:
prefix = 'decision-tree-as-a-built-in-algo'
output_path = f's3://{bucket_name}/{prefix}/output'
print(output_path)

s3://bank-churn-bucket/decision-tree-as-a-built-in-algo/output


##### Upload the data files from the local machine

In [48]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [49]:
train_df.drop(columns=['id', 'CustomerId', 'Surname', 'Geography', 'Gender'], inplace=True)
test_df.drop(columns=['id', 'CustomerId', 'Surname', 'Geography', 'Gender'], inplace=True)

In [54]:
train_df = train_df[:25000]
test_df = test_df[:10000]

In [55]:
# In AWS, the target variable must be the first feature.
train_df = train_df[['Exited', 'CreditScore', 'Age', 'Tenure', 'Balance', 
                     'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']]
train_df.to_csv('train.csv', index=False, header=False)

test_df = test_df[['Exited', 'CreditScore', 'Age', 'Tenure', 'Balance', 
                    'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']]
test_df.to_csv('test.csv', index=False, header=False)

##### Saving data files in the 3 bucket
- Go to the AWS Console → IAM → Roles
- Search for: AmazonSageMaker-ExecutionRole-20250411T191722
- Click on the role, then click Attach policies
- Attach the managed policy: AmazonS3FullAccess

In [72]:
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'data/train.csv')).upload_file('train.csv')
s3_input_train = sagemaker.TrainingInput(s3_data=f's3://{bucket_name}/{prefix}', content_type='csv')
print('Train file saved in the S3 bucket.')

boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'data/test.csv')).upload_file('test.csv')
s3_input_test = sagemaker.TrainingInput(s3_data=f's3://{bucket_name}/{prefix}', content_type='csv')
print('Test file saved in the S3 bucket.')

Train file saved in the S3 bucket.
Test file saved in the S3 bucket.


In [65]:
xgboost_container = sagemaker.image_uris.retrieve("xgboost", boto3.Session().region_name, "1.7-1")

In [63]:
hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "objective":"binary:logistic", # We have a classification problem
        "num_round":50
        }

In [69]:
estimator = sagemaker.estimator.Estimator(image_uri=xgboost_container, 
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          instance_count=1, 
                                          instance_type='ml.m5.2xlarge', 
                                          volume_size=5, # 5 GB 
                                          output_path=output_path,
                                          use_spot_instances=True,
                                          max_run=300,
                                          max_wait=600)     

In [73]:
estimator.fit({'train': s3_input_train, 'validation': s3_input_test})

2025-04-12 14:20:45 Starting - Starting the training job...
..25-04-12 14:21:07 Starting - Preparing the instances for training.
.....04-12 14:21:41 Downloading - Downloading the training image.
[34m[2025-04-12 14:22:50.203 ip-10-0-182-115.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-04-12 14:22:50.227 ip-10-0-182-115.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-04-12:14:22:50:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-04-12:14:22:50:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2025-04-12:14:22:50:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-04-12:14:22:50:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2025-04-12:14:22:50:INFO] Determined 0 GPU(s) available on the instance.[0m
[34m[2025-04-12:14:22:50:INFO] Determined delimiter of CSV input is ','[0

In [74]:
# initial_instance_count indicates the number of compute instances that will be launched to host your deployed model
# initial_instance_count=1: Good for dev/test or low-traffic use.
# initial_instance_count=2+: Useful in production where you expect concurrent requests and want high availability.
xgb_predictor = estimator.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

-------!

In [139]:
# Display the model endpoint
xgb_predictor.endpoint_name

'sagemaker-xgboost-2025-04-12-14-35-53-165'

In [96]:
test_data_array = test_df.drop(['Exited'], axis=1).values

xgb_predictor.serializer = CSVSerializer()
xgb_predictor.content_type = 'text/csv'

# Convert array to multi-line CSV string (each row is one sample)
csv_payload = "\n".join([",".join(map(str, row)) for row in test_data_array])

# Send the entire batch
response = xgb_predictor.predict(csv_payload).decode("utf-8")

# Convert the returned string to an array
predictions_array = np.fromstring(response.strip(), sep="\n")

In [97]:
print(predictions_array.shape)

(10000,)


In [None]:
# Convert predictions to 0 or 1 for easier comparison
predictions_class = np.round(predictions_array)

# Create confusion matrix
cm = pd.crosstab(index=test_df['Exited'], columns=predictions_class, rownames=['Observed'], colnames=['Predicted'])

# Extract values from confusion matrix
tn = cm.iloc[0, 0]  # True Negative
fn = cm.iloc[1, 0]  # False Negative
tp = cm.iloc[1, 1]  # True Positive
fp = cm.iloc[0, 1]  # False Positive

# Calculate overall classification rate (accuracy)
accuracy = (tp + tn) / (tp + tn + fp + fn) * 100

# Print overall classification rate
print("\n{:<20}{:<4.1f}%".format("Overall Classification Rate:", accuracy))

# Print confusion matrix
print("{:<15}{:<15}{:>8}".format("Predicted", "No Exited", "Exited"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Exited", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Exited", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))

In [117]:
real = [100, 27, 3, 4050, 1, 0, 0, 98345]

In [118]:
res_real = xgb_predictor.predict(real).decode("utf-8")

# Convert the returned string to an array
predictions_array = np.fromstring(res_real.strip(), sep="\n")

threshold = 0.5
predicted_class = "Exited" if predictions_array[0] > threshold else "Did not exit"
print(predicted_class)

Did not exit


##### Detele the endpoints

In [140]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': 'GZGVBZ7G38SB7NF2',
   'HostId': 'NaIefYccVmqMC+QPkGCVAHebL4ihXXMf7Z9FD9oY9U1LvixVBVtKcff5qgdV5dZ5kc6wrAM0ojX2OQakEmyTeWkRvLBG+xbGAfFz1wYV/JU=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'NaIefYccVmqMC+QPkGCVAHebL4ihXXMf7Z9FD9oY9U1LvixVBVtKcff5qgdV5dZ5kc6wrAM0ojX2OQakEmyTeWkRvLBG+xbGAfFz1wYV/JU=',
    'x-amz-request-id': 'GZGVBZ7G38SB7NF2',
    'date': 'Sat, 12 Apr 2025 15:23:39 GMT',
    'connection': 'close',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'decision-tree-as-a-built-in-algo/output/sagemaker-xgboost-2025-04-12-14-20-44-997/profiler-output/system/incremental/2025041214/1744467660.algo-1.json'},
   {'Key': 'decision-tree-as-a-built-in-algo/data/test.csv'},
   {'Key': 'decision-tree-as-a-built-in-algo/output/sagemaker-xgboost-2025-04-12-14-20-44-997/debug-output/index/000000000/000000000010_worker_0.json'},
   {'Key': 'de