# AWS Sagemaker Tutorial (XGBoost Classifier)

Loan Classification

Data source: [source](https://datahack.analyticsvidhya.com/contest/practice-problem-loan-prediction-iii/#About)

## 1. Set the AWS and Prepare the Data

In [1]:
# After creating sagemaker notebook instance

import numpy as np
import pandas as pd
import os
from io import StringIO
import boto3, sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input, Session
from sagemaker.predictor import csv_serializer

In [2]:
# After creating a bucket
# Name the bucket
bucket_name = "sagemaker-loan-classification"
# Check the region name
region = boto3.session.Session().region_name
print(region)

ap-southeast-1


In [3]:
# Set the output path for the saved model
prefix = "xgboost"
output_path = f"s3://{bucket_name}/{prefix}/saved_model"

In [4]:
# Load data
data = pd.read_csv(f"s3://{bucket_name}/{prefix}/raw_data/loan_data.csv")
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
# Label feature must be the first feature
data = data.dropna(axis=0)
data = data.drop(columns=["Loan_ID"])
data["Gender"] = np.where(data["Gender"] == "Male", 1, 0)
data["Married"] = np.where(data["Married"] == "Yes", 1, 0)
data["Dependents"] = np.where(
    data["Dependents"] == "0",
    0,
    np.where(data["Dependents"] == "1", 1, np.where(data["Dependents"] == "2", 2, 3)),
)
data["Education"] = np.where(data["Education"] == "Graduate", 1, 0)
data["Self_Employed"] = np.where(data["Self_Employed"] == "Yes", 1, 0)
data["Loan_Status"] = np.where(data["Loan_Status"] == "Y", 1, 0)
data = pd.get_dummies(data, columns=["Property_Area"])
label = data.pop("Loan_Status")
data.insert(0, "Loan_Status", label)
data.head()

Unnamed: 0,Loan_Status,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
1,0,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,1,0,0
2,1,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,0,0,1
3,1,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,0,0,1
4,1,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,0,0,1
5,1,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,0,0,1


In [6]:
# Train - test split
train, test = np.split(data.sample(frac=1, random_state=123), [int(0.8 * len(data))])

In [7]:
# Save the train and test data
s3_resource = boto3.resource("s3")

# Save the files to s3 bucket
csv_io = StringIO()
train.to_csv(csv_io, index=False, header=False)
s3_resource.Object(bucket_name, f"{prefix}/train/train.csv").put(Body=csv_io.getvalue())

csv_io = StringIO()
test.to_csv(csv_io, index=False, header=False)
s3_resource.Object(bucket_name, f"{prefix}/test/test.csv").put(Body=csv_io.getvalue())

{'ResponseMetadata': {'RequestId': 'PS1S9Y33ECYMTV05',
  'HostId': '9nmBOODbuP+EsmJYFrT8qvSSN+tcvxkdcvt3Qo2eLx7wZsVadxxBEY4K62n9OMJdbyGXiVhacyM=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '9nmBOODbuP+EsmJYFrT8qvSSN+tcvxkdcvt3Qo2eLx7wZsVadxxBEY4K62n9OMJdbyGXiVhacyM=',
   'x-amz-request-id': 'PS1S9Y33ECYMTV05',
   'date': 'Sun, 16 Jan 2022 08:15:16 GMT',
   'etag': '"815fcfa2f94b8b9f2b2e73009d439bd4"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"815fcfa2f94b8b9f2b2e73009d439bd4"'}

In [8]:
# Set input path for sagemaker
input_train = sagemaker.TrainingInput(s3_data=f"s3://{bucket_name}/{prefix}/train", content_type="csv")
input_test = sagemaker.TrainingInput(s3_data=f"s3://{bucket_name}/{prefix}/test", content_type="csv")

## 2. Create and Train the Model

In [9]:
# Build the estimator
container = sagemaker.image_uris.retrieve("xgboost", region, "latest")

xgb = sagemaker.estimator.Estimator(
    image_uri=container,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path=output_path,
    sagemaker_session=sagemaker.Session(),
    use_spot_instance=True,
    max_run=300,
    #max_wait=600,
    base_job_name="xgboost-loan",
)

xgb.set_hyperparameters(
    colsample_bytree=0.478,
    gamma=2.387,
    eta=0.175,
    max_depth=15,
    min_child_weight=7,
    num_round=86,
    subsample=0.80,
    num_class=2,
    objective= "multi:softmax", #"binary:logistic"
)

In [10]:
# Role
print(container)
print(sagemaker.get_execution_role())

475088953585.dkr.ecr.ap-southeast-1.amazonaws.com/xgboost:latest
arn:aws:iam::453160297490:role/service-role/AmazonSageMaker-ExecutionRole-20211226T170651


In [11]:
# Fit the training data
xgb.fit(
    {
        "train":input_train,
        "validation":input_test
    }
)

2022-01-16 08:16:07 Starting - Starting the training job...
2022-01-16 08:16:09 Starting - Launching requested ML instancesProfilerReport-1642320967: InProgress
......
2022-01-16 08:17:38 Starting - Preparing the instances for training......
2022-01-16 08:18:38 Downloading - Downloading input data...
2022-01-16 08:19:01 Training - Training image download completed. Training in progress.
2022-01-16 08:19:01 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2022-01-16:08:18:56:INFO] Running standalone xgboost training.[0m
[34m[2022-01-16:08:18:56:INFO] File size need to be processed in the node: 0.02mb. Available memory size in the node: 8026.28mb[0m
[34m[2022-01-16:08:18:56:INFO] Determined delimiter of CSV input is ','[0m
[34m[08:18:56] S3DistributionType set as FullyReplicated[0m
[34m[08:18:56] 384x13 matrix with 4992 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2022-01-16:08:18:56:INFO] Determined del

## 3. Deploy Model and Make Predictions

In [None]:
# Deployment
xgb_model = xgb.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name="xgboost-loan",
)

In [37]:
xgb_model = sagemaker.predictor.Predictor(endpoint_name="xgboost-loan")

In [38]:
xgb_model.endpoint_name

'xgboost-loan'

In [39]:
from sagemaker.serializers import CSVSerializer

In [40]:
# Making prediction
test_load = test
test_load = test.drop(columns=["Loan_Status"]).values
xgb_model.serializer = CSVSerializer() # set the serializer type
pred = xgb_model.predict(test_load).decode('utf-8')
pred = np.fromstring(pred[1:], sep=',') # convert to an array
print(pred.shape)
print(pred)

(95,)
[0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1.
 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


## 4. Evaluate the Model Score

In [49]:
# Evaluate the model
confusion_matrix = pd.crosstab(index=test["Loan_Status"], columns=np.round(pred), rownames=['True Values'], colnames=['Predicted Values'])
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Classification Accuracy: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "Non-Performing", "Performing"))
print("True Values")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("Non-Performing", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Performing", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))


Classification Accuracy: 73.7%

Predicted      Non-Performing Performing
True Values
Non-Performing 62% (5)    25% (22)
Performing      38% (3)     75% (65) 



## 5. Download the Notebook and Delete the Resources

In [50]:
xgb_model.endpoint

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


'xgboost-loan'

In [None]:
# Download the notebook

# Delete the resources
sagemaker.Session().delete_endpoint(xgb_model.endpoint)
delete_bucket = boto3.resource("s3").Bucket(bucket_name)
delete_bucket.objects.all().delete()