In [1]:
bucket = 'lpbkt123456'
prefix = 'sagemaker/xgboost_loan_risk'

In [2]:
import boto3
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer

role = get_execution_role()

In [3]:

data_key = 'training.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)


In [4]:
dataset = pd.read_csv(data_location)
pd.set_option('display.max_rows', 8)
pd.set_option('display.max_columns', 15)
dataset.shape
dataset

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
149996,149997,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0
149997,149998,0,0.246044,58,0,3870.000000,,18,0,1,0,0.0
149998,149999,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0
149999,150000,0,0.850283,64,0,0.249908,8158.0,8,0,2,0,0.0


In [5]:
dataset = dataset.drop('Unnamed: 0', axis=1)
#Feature engineering steps
#drop the anamolies. Drop all the NaN values for monthlyincome and number of dependents
dataset.dropna(subset = ["MonthlyIncome","NumberOfDependents"], inplace=True)
dataset.drop(dataset[dataset.DebtRatio > 1.0].index, inplace=True)
#dataset.drop('Unnamed: 0', axis=1)
dataset.shape
dataset


Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
149995,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0
149996,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0
149998,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0
149999,0,0.850283,64,0,0.249908,8158.0,8,0,2,0,0.0


In [6]:
dataset.shape

(113036, 11)

In [7]:
dataset.shape
dataset.head

<bound method NDFrame.head of         SeriousDlqin2yrs  RevolvingUtilizationOfUnsecuredLines  age  \
0                      1                              0.766127   45   
1                      0                              0.957151   40   
2                      0                              0.658180   38   
3                      0                              0.233810   30   
...                  ...                                   ...  ...   
149995                 0                              0.040674   74   
149996                 0                              0.299745   44   
149998                 0                              0.000000   30   
149999                 0                              0.850283   64   

        NumberOfTime30-59DaysPastDueNotWorse  DebtRatio  MonthlyIncome  \
0                                          2   0.802982         9120.0   
1                                          0   0.121876         2600.0   
2                                    

In [8]:
train_data, validation_data, test_data = np.split(dataset.sample(frac=1, random_state=1729), [int(0.7 * len(dataset)), int(0.9 * len(dataset))])
train_data.to_csv('train.csv', header=False, index=False)
validation_data.to_csv('validation.csv', header=False, index=False)

In [9]:
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation/validation.csv')).upload_file('validation.csv')
s3_input_train = TrainingInput(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='csv')
s3_input_validation = TrainingInput(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')

In [10]:
dataset.sample(frac=1, random_state=1729)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
125044,0,0.994626,42,0,0.485642,2750.0,7,0,1,0,2.0
103075,0,0.125874,23,0,0.000760,3945.0,1,0,0,0,0.0
92160,0,0.195645,32,0,0.035327,6000.0,3,0,0,0,0.0
54515,0,0.076923,81,0,0.021475,4283.0,7,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
72694,0,0.594083,39,0,0.675020,3750.0,11,0,1,0,0.0
88611,0,0.228149,46,0,0.539906,3833.0,11,0,1,0,0.0
91159,0,0.117586,37,0,0.263462,13333.0,11,0,2,0,2.0
2228,0,0.000000,43,1,0.260749,13000.0,15,0,2,0,2.0


In [11]:
# containers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
#               'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
#               'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
#               'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'}

container = sagemaker.image_uris.retrieve("xgboost", boto3.Session().region_name, "latest")
sess = sagemaker.Session()

In [89]:
#xgb_predictor = xgb.deploy(
#	initial_instance_count = 1,
#	instance_type = 'ml.m4.xlarge',
#	serializer = CSVSerializer())

-------!

In [12]:
xgb = sagemaker.estimator.Estimator(container,
                                    role, 
                                    instance_count=1, 
                                    instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    sagemaker_session=sess)
#xgb.set_hyperparameters(eta=0.1, objective='binary:logistic', num_round=25) 
#xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})
xgb.set_hyperparameters(
    eval_metric="auc",
    objective="binary:logistic",
    num_round=100,
    rate_drop=0.3,
    tweedie_variance_power=1.4,
)

In [14]:
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)

hyperparameter_ranges = {
    "eta": ContinuousParameter(0, 1),
    "min_child_weight": ContinuousParameter(1, 10),
    "alpha": ContinuousParameter(0, 2),
    "max_depth": IntegerParameter(1, 10),
}

objective_metric_name = "validation:auc"
tuner = HyperparameterTuner(
    xgb, objective_metric_name, hyperparameter_ranges, max_jobs=20, max_parallel_jobs=3
)
tuner.fit({"train": s3_input_train, "validation": s3_input_validation}, include_cls_metadata=False)


...........................................................................................................................................................................................................................................................................................................................!


In [15]:
boto3.client("sagemaker").describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name
)["HyperParameterTuningJobStatus"]

#deploy the end point
xgb_predictor = tuner.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')


2022-03-01 06:43:15 Starting - Preparing the instances for training
2022-03-01 06:43:15 Downloading - Downloading input data
2022-03-01 06:43:15 Training - Training image download completed. Training in progress.
2022-03-01 06:43:15 Uploading - Uploading generated training model
2022-03-01 06:43:15 Completed - Training job completed
-------!

In [19]:
xgb_predictor.serializer = sagemaker.serializers.CSVSerializer()

In [20]:
def predict(data, rows=500):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions, xgb_predictor.predict(array).decode('utf-8')])

    return np.fromstring(predictions[1:], sep=',')


predictions = predict(test_data.to_numpy()[:,1:])
predictions

array([0.01641897, 0.01056557, 0.01227011, ..., 0.03497043, 0.01936392,
       0.04212051])

In [21]:
sa1= np.array_split(test_data.to_numpy()[:,1:], int(test_data.to_numpy()[:,1:].shape[0] / float(500) + 1))
sa1[1] 

array([[6.79479050e-02, 4.50000000e+01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 2.00000000e+00],
       [9.36446462e-01, 8.10000000e+01, 0.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [9.86813510e-02, 6.80000000e+01, 0.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [1.97036498e-01, 4.80000000e+01, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 2.00000000e+00],
       [3.14742806e-01, 3.70000000e+01, 0.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 2.00000000e+00],
       [3.11597405e-01, 4.80000000e+01, 0.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [22]:
pd.crosstab(
    index=test_data.iloc[:, 0],
    columns=np.round(predictions),
    rownames=["actual"],
    colnames=["predictions"],
)

predictions,0.0,1.0
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10510,108
1,557,129


In [23]:
cm = pd.crosstab(index=test_data['SeriousDlqin2yrs'], columns=np.round(predictions), rownames=['Observed'], colnames=['Predicted'])
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Default", "Default"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Default", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Default", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))


Overall Classification Rate: 94.1%

Predicted      No Default      Default
Observed
No Default     95% (10510)    46% (108)
Default         5% (557)     54% (129) 



In [29]:
xgb_predictor.delete_endpoint()

ClientError: An error occurred (ValidationException) when calling the DescribeEndpoint operation: Could not find endpoint "arn:aws:sagemaker:us-west-2:425165844512:endpoint/xgboost-2022-02-09-09-13-23-671".