In [1]:
import configparser, urllib.request, os, time
import boto3, sagemaker, pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sagemaker.inputs import TrainingInput
from sagemaker.amazon.amazon_estimator import image_uris
from sagemaker.serializers import CSVSerializer
from sagemaker.session import Session


  from pandas.core.computation.check import NUMEXPR_INSTALLED


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


# Reading config files

In [2]:
CFG_FILE = "aws_config.cfg"
# Read credentails and region from config file
cp=configparser.ConfigParser()
if not cp.read(CFG_FILE):
    raise FileNotFoundError(
        f"Missing {CFG_FILE}.Expected AWS access key id and security access key id "
        )

In [3]:
cp["default"]["aws_access_key_id"]

'AKIA3TD2S3GE2Z6GPRG2'

## Boto Library
Work and Track AWS INFRA

In [4]:
s3_client = boto3.client('s3')
BUCKET = "sagemaker-mlop-test"
region = cp["default"].get("region","us-east-1")

try:
    if region == 'us-east-1':
        s3_client.create_bucket(Bucket=BUCKET)
    else:
        s3_client.create_bucket(
            Bucket=BUCKET,CreateBucketConfiguration={"LocationConstraint":region}
                               )
    print('s3 bucket created successfully',BUCKET)
except s3_client.exceptions.BucketAlreadyOwnedByYou as e:
    print('use existing s3 bucket',BUCKET)

use existing s3 bucket sagemaker-mlop-test


## using Boto operations for interacting with S3

In [5]:
# List S3 buckets
response = s3_client.list_buckets()
print("Available S3 Buckets:")
for bucket in response['Buckets']:
    print(f" - {bucket['Name']} (Created: {bucket['CreationDate']})")

Available S3 Buckets:
 - sagemaker-mlop-test (Created: 2025-07-17 11:07:21+00:00)


In [6]:
# Upload a file to S3
local_file_path = 'empty.txt'
s3_key = 's3_empty.txt'
open('empty.txt', 'a').close()

s3_client.upload_file(local_file_path, BUCKET, s3_key)
print(f"Successfully uploaded {local_file_path} to s3://{BUCKET}/{s3_key}")

Successfully uploaded empty.txt to s3://sagemaker-mlop-test/s3_empty.txt


In [7]:
# Download the file back from S3
local_file_path = 'downloaded_from_s3.txt'
s3_client.download_file(BUCKET, s3_key, local_file_path)
print(f"Successfully downloaded s3://{BUCKET}/{s3_key} to {local_file_path}")

Successfully downloaded s3://sagemaker-mlop-test/s3_empty.txt to downloaded_from_s3.txt


In [8]:
# List objects in the S3 bucket
response = s3_client.list_objects_v2(Bucket=BUCKET)

print(f"Contents of s3://{BUCKET}:")
for obj in response['Contents']:
    print(f" - {obj['Key']} (Size: {obj['Size']} bytes, Modified: {obj['LastModified']})")

Contents of s3://sagemaker-mlop-test:
 - s3_empty.txt (Size: 0 bytes, Modified: 2025-07-17 15:00:29+00:00)
 - xgboost-bank/output/sagemaker-xgboost-2025-07-17-14-36-03-713/debug-output/claim.smd (Size: 0 bytes, Modified: 2025-07-17 14:37:55+00:00)
 - xgboost-bank/output/sagemaker-xgboost-2025-07-17-14-36-03-713/debug-output/collections/000000000/worker_0_collections.json (Size: 6282 bytes, Modified: 2025-07-17 14:37:55+00:00)
 - xgboost-bank/output/sagemaker-xgboost-2025-07-17-14-36-03-713/debug-output/events/000000000000/000000000000_worker_0.tfevents (Size: 230 bytes, Modified: 2025-07-17 14:37:55+00:00)
 - xgboost-bank/output/sagemaker-xgboost-2025-07-17-14-36-03-713/debug-output/events/000000000010/000000000010_worker_0.tfevents (Size: 236 bytes, Modified: 2025-07-17 14:37:55+00:00)
 - xgboost-bank/output/sagemaker-xgboost-2025-07-17-14-36-03-713/debug-output/events/000000000020/000000000020_worker_0.tfevents (Size: 236 bytes, Modified: 2025-07-17 14:37:55+00:00)
 - xgboost-bank/ou

# Downloading Datasets

In [9]:
# Downloading Dataset
DATA_URL = (
    "https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-"
    "sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv"
)

# Define local file path
LOCAL_CSV = "bank_clean.csv"

if not os.path.exists(LOCAL_CSV):
    urllib.request.urlretrieve(DATA_URL, LOCAL_CSV)
    print("Downloaded", LOCAL_CSV)
else:
    print("Dataset already present -", LOCAL_CSV)

df = pd.read_csv(LOCAL_CSV, index_col=0)
print("Shape:", df.shape)
display(df.head())

Dataset already present - bank_clean.csv
Shape: (41188, 61)


Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
0,56,1,999,0,1,0,0,0,0,1,...,0,1,0,0,0,0,1,0,1,0
1,57,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
2,37,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
3,40,1,999,0,1,0,1,0,0,0,...,0,1,0,0,0,0,1,0,1,0
4,56,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0


In [10]:
### Train Test split
train_df,test_df = train_test_split(
    df, test_size=0.3,shuffle=True, random_state=1729
)

## If large data better to use np.split
# import numpy as np
# train_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), [int(0.7 * len(model_data))])
# print(train_data.shape, test_data.shape)

In [11]:
PREFIX = 'xgboost-bank'

In [12]:
train_pay_load=pd.concat([train_df['y_yes'],train_df.drop(['y_yes','y_no'],axis=1)],axis=1).head()
train_csv_path="train.csv"
train_pay_load.to_csv(train_csv_path,index=False,header=False)

s3_train_key=f"{PREFIX}/train/{train_csv_path}"
print(s3_train_key)
s3_client.upload_file(train_csv_path, BUCKET, s3_train_key)
print(f"Successfully uploaded training data to  s3://{BUCKET}/{s3_train_key}")

xgboost-bank/train/train.csv
Successfully uploaded training data to  s3://sagemaker-mlop-test/xgboost-bank/train/train.csv


In [13]:
test_pay_load=pd.concat([test_df['y_yes'],test_df.drop(['y_yes','y_no'],axis=1)],axis=1).head()
test_csv_path="test.csv"
test_pay_load.to_csv(test_csv_path,index=False,header=False)

s3_test_key=f"{PREFIX}/test/{test_csv_path}"
print(s3_test_key)
s3_client.upload_file(test_csv_path, BUCKET, s3_test_key)
print(f"Successfully uploaded testing data to  s3://{BUCKET}/{s3_test_key}")

xgboost-bank/test/test.csv
Successfully uploaded testing data to  s3://sagemaker-mlop-test/xgboost-bank/test/test.csv


In [14]:
# Define S3 paths
f"s3://{BUCKET}/{PREFIX}/train/"
# Example path: 's3://mlops-dsml-june27/xgboost-bank/train/'

# Create TrainingInput objects
s3_train = TrainingInput(f"s3://{BUCKET}/{PREFIX}/train/", content_type="text/csv")
s3_val   = TrainingInput(f"s3://{BUCKET}/{PREFIX}/test/", content_type="text/csv")




In [15]:
# Retrieve XGBoost container image
container = image_uris.retrieve('xgboost', boto3.Session().region_name, '1.5-1')

hyperparameters = {
    "max_depth": "4",               # Slightly shallower tree to prevent overfitting
    "eta": "0.1",                   # Lower learning rate for better generalization
    "gamma": "2",                   # Reduced to allow more splits
    "min_child_weight": "1",        # Allows splits that are more sensitive to class 1
    "scale_pos_weight": "9.0",      # IMPORTANT: imbalance correction (ratio of class 0 / class 1)
    "subsample": "0.8",             # Slightly increased to improve training diversity
    "colsample_bytree": "0.8",      # Added to prevent overfitting
    "objective": "binary:logistic", # Binary classification task
    "eval_metric": "auc",           # Good metric for imbalanced classes
    "num_round": "100",             # More boosting rounds to learn complex patterns
    "verbosity": "1"
}


estimator = sagemaker.estimator.Estimator(image_uri=container, 
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          instance_count=1, 
                                          instance_type='ml.m5.2xlarge', 
                                          volume_size=5, # 5 GB 
                                          output_path=f"s3://{BUCKET}/{PREFIX}/output")

In [16]:
print("Starting training ...")

estimator.fit(
    {"train": s3_train, "validation": s3_val},
    wait=True
)

print("Training job completed -", estimator.latest_training_job.name)


INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-07-17-15-00-29-436


Starting training ...
2025-07-17 15:00:29 Starting - Starting the training job.........
2025-07-17 15:02:00 Downloading - Downloading input data...
2025-07-17 15:02:15 Downloading - Downloading the training image...
2025-07-17 15:02:55 Training - Training image download completed. Training in progress....
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-07-17 15:03:18.627 ip-10-0-238-205.us-west-2.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-07-17 15:03:18.649 ip-10-0-238-205.us-west-2.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-07-17:15:03:18:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-07-17:15:03:18:INFO] Failed to parse hyperparameter eval_metric value auc to Json.[0m
[34mReturning the value itself[0m
[34m[2025-07-17:15:03:18:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m

# Deployment

In [17]:
# 6. Deploy Realtime Endpoint
print("Deploying endpoint – this may take a few minutes")

predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.2xlarge",
    serializer=CSVSerializer()
)

endpoint_name = predictor.endpoint_name
print("Endpoint active –", endpoint_name)


INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-07-17-15-04-41-379


Deploying endpoint – this may take a few minutes


INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2025-07-17-15-04-41-379
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2025-07-17-15-04-41-379


------!Endpoint active – sagemaker-xgboost-2025-07-17-15-04-41-379


In [18]:
# 7. Evaluate on the Test Set
X_test = test_df.drop(["y_no", "y_yes"], axis=1).values

probs_text = predictor.predict(X_test).decode("utf-8").strip().split("\n")
probs = np.asarray(probs_text, dtype=float)

y_true = test_df["y_yes"].values

print("Confusion Matrix:\n", confusion_matrix(y_true, probs > 0.5))
print(
    "\nClassification Report:\n",
    classification_report(y_true, probs > 0.5, digits=4)
)


Confusion Matrix:
 [[11032     0]
 [ 1325     0]]

Classification Report:
               precision    recall  f1-score   support

           0     0.8928    1.0000    0.9433     11032
           1     0.0000    0.0000    0.0000      1325

    accuracy                         0.8928     12357
   macro avg     0.4464    0.5000    0.4717     12357
weighted avg     0.7970    0.8928    0.8422     12357



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
