### Download Dataset

In [1]:
!pip install openpyxl
!pip install xlrd



In [9]:
import pandas as pd
import boto3
import os
import sagemaker
# Download the dataset: fetch the UCI "default of credit card clients" workbook
# and write it to credit_card_default.xls in the working directory.
# NOTE(review): --no-check-certificate turns off TLS certificate verification,
# so the downloaded file is not authenticated. Presumably added because this
# host cannot verify the server's issuer -- confirm, and prefer fixing the CA
# bundle over disabling the check.
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls -O credit_card_default.xls --no-check-certificate


--2025-04-15 06:17:39--  https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
connected. to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... 
  Unable to locally verify the issuer's authority.
200 OKequest sent, awaiting response... 
Length: unspecified
Saving to: ‘credit_card_default.xls’

credit_card_default     [   <=>              ]   5.28M  9.40MB/s    in 0.6s    

2025-04-15 06:17:40 (9.40 MB/s) - ‘credit_card_default.xls’ saved [5539328]



In [3]:
# Parse the workbook, taking the column names from the second row (header=1),
# then persist the frame as a CSV without the pandas index column.
df = pd.read_excel(
    io='credit_card_default.xls',
    header=1,
)
df.to_csv(
    path_or_buf='credit_card_default.csv',
    index=False,
)


#### Use the SageMaker session to upload the dataset to an S3 bucket

In [4]:


# Create the SageMaker session and pick its default bucket; every object this
# notebook writes goes under the key prefix below.
session = sagemaker.Session()
bucket = session.default_bucket()
prefix = 'sagemaker/xgboost-credit'
print("Bucket in Use ", bucket)

# Push the full CSV to S3 and report the resulting URI.
input_path = session.upload_data(
    path='credit_card_default.csv',
    bucket=bucket,
    key_prefix=prefix,
)
print(f'Dataset uploaded to: {input_path}')


Bucket in Use  sagemaker-us-east-1-589347638345
Dataset uploaded to: s3://sagemaker-us-east-1-589347638345/sagemaker/xgboost-credit/credit_card_default.csv


In [5]:
# Preview the first rows of the loaded frame (rich display of the last expression).
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


#### Preprocess Data
##### Split the dataset into training and testing sets:

In [6]:
from sklearn.model_selection import train_test_split

TARGET_COL = 'default payment next month'

# Separate features and target.
# 'ID' is a sequential row identifier, not a customer attribute; leaving it in
# the feature matrix lets the model fit on row order, so it is dropped here.
# errors='ignore' keeps the cell working if the column is already absent.
X = df.drop(columns=[TARGET_COL]).drop(columns=['ID'], errors='ignore')
y = df[TARGET_COL]

# Hold out 20% of the rows; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SageMaker's built-in XGBoost expects CSV input with the label in the first
# column and no header row, hence target-first concat and header=False below.
train_data = pd.concat([y_train, X_train], axis=1)
test_data = pd.concat([y_test, X_test], axis=1)

# Sanity checks: label is the first column and no rows were lost in the split.
assert train_data.columns[0] == TARGET_COL and test_data.columns[0] == TARGET_COL
assert len(train_data) + len(test_data) == len(df)

# Save to CSV
train_data.to_csv('train.csv', index=False, header=False)
test_data.to_csv('test.csv', index=False, header=False)


#### Upload Split Data to S3

In [7]:
# Stage both splits in S3, each under its own sub-prefix; the returned S3 URIs
# are kept for the later cells.
train_input = session.upload_data(
    path='train.csv',
    bucket=bucket,
    key_prefix=f'{prefix}/train',
)
test_input = session.upload_data(
    path='test.csv',
    bucket=bucket,
    key_prefix=f'{prefix}/test',
)

#### Train The Model
##### Configure the XGBoost estimator:

In [12]:
from sagemaker import image_uris
from sagemaker.estimator import Estimator

# Get the XGBoost image URI for the session's region
xgboost_image_uri = image_uris.retrieve('xgboost', region=session.boto_region_name, version='1.0-1')

# Resolve the IAM role at run time instead of hard-coding an account-specific
# ARN in the notebook. Set SAGEMAKER_ROLE_ARN to override (e.g. when running
# outside SageMaker, where no execution role is attached); otherwise the role
# attached to this notebook environment is used.
role = os.environ.get('SAGEMAKER_ROLE_ARN') or sagemaker.get_execution_role()

# Define the estimator: one ml.m5.large training instance, with model
# artifacts written under <bucket>/<prefix>/output.
xgb_estimator = Estimator(
    image_uri=xgboost_image_uri,
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=session
)

# Set hyperparameters: binary classification with probability output,
# trained for 100 boosting rounds.
xgb_estimator.set_hyperparameters(
    objective='binary:logistic',
    num_round=100
)


#### Specify the data channels and initiate training

In [13]:
from sagemaker.inputs import TrainingInput

# Wrap the uploaded training CSV as the 'train' channel and launch the
# training job with it.
train_channel = TrainingInput(
    s3_data=train_input,
    content_type='csv',
)
xgb_estimator.fit(inputs={'train': train_channel})


2025-04-15 06:20:02 Starting - Starting the training job...
2025-04-15 06:20:25 Starting - Preparing the instances for trainingProfilerReport-1744698002: InProgress
..
.....04-15 06:20:55 Downloading - Downloading input data.
..25-04-15 06:21:55 Downloading - Downloading the training image.
[34m[2025-04-15 06:22:24.289 ip-10-0-157-8.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Single node training.[0m
[34mINFO:root:Trai

#### Deploy the model to a real-time inference endpoint

In [None]:
from sagemaker.serializers import CSVSerializer

# Host the trained model on a single ml.m5.large endpoint instance; the
# predictor is given a CSV serializer for its request payloads.
xgb_predictor = xgb_estimator.deploy(
    instance_type='ml.m5.large',
    initial_instance_count=1,
    serializer=CSVSerializer(),
)


---

#### Use the deployed endpoint to make predictions

In [None]:
# Take the first held-out row as a one-row DataFrame. Selecting with a list
# (iloc[[0]]) keeps each column's own dtype; going through a Series
# (iloc[0] -> pd.DataFrame([...])) would coerce mixed columns to one dtype and
# could change how values are written to the CSV payload.
sample_df = X_test.iloc[[0]]

# Same layout as the training features: no header, no index column.
sample_csv = sample_df.to_csv(header=False, index=False)

# Send the row to the endpoint and show the raw response.
prediction = xgb_predictor.predict(sample_csv)
print(prediction)

In [None]:
# Decision cut-off applied to the endpoint's score.
threshold = 0.5

# The response is decoded from UTF-8 bytes and parsed as a single float.
probability = float(prediction.decode('utf-8'))

# 1 when the score reaches the threshold, otherwise 0.
predicted_class = 1 if probability >= threshold else 0

print(f"Predicted class: {predicted_class}")

In [None]:
# Ground-truth label for the first held-out row (the row sent to the endpoint),
# shown for comparison with the predicted class.
y_test.iloc[0]

#### To avoid incurring charges, delete the endpoint when done

In [None]:
# Delete the endpoint created by the deploy cell so it stops running once the
# predictions above are no longer needed.
xgb_predictor.delete_endpoint()
