### 1. Create a S3 bucket

In [1]:
import boto3
s3 = boto3.resource('s3')

In [32]:
bucket_name = 'loan-default-predictor'
try:
    s3.create_bucket(Bucket=bucket_name)
    print('S3 bucket has been created successfully')
except Exception as e:
    print('S3 error: ', e)

S3 bucket has been created successfully


### 2. Loading and transforming the data

In [3]:
import pandas as pd
df=pd.read_csv('cresent_bank_short_data.csv')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13718 entries, 0 to 13717
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   masked_acct              13718 non-null  object 
 1   MoodysUVIDiffCOFromOrig  13718 non-null  float64
 2   VehicleValueBlackBook    13697 non-null  float64
 3   FinancedAmt              13718 non-null  float64
 4   PaymentAmt               13718 non-null  float64
 5   TotalDownPmt             13718 non-null  float64
 6   ChargeOffMOB             13718 non-null  int64  
 7   FICOScore                9504 non-null   float64
 8   true_recovery_rate       13718 non-null  float64
dtypes: float64(7), int64(1), object(1)
memory usage: 964.7+ KB


In [6]:
df

Unnamed: 0,masked_acct,MoodysUVIDiffCOFromOrig,VehicleValueBlackBook,FinancedAmt,PaymentAmt,TotalDownPmt,ChargeOffMOB,FICOScore,true_recovery_rate
0,0001374cd7856b8999ef0b661d515142,2.39,6100.0,10397.4502,282.20,750.0,15,535.0,0.1856
1,000170f205857037065589a606caae25,1.49,17575.0,22360.2500,538.19,2000.0,8,652.0,0.5442
2,00060788de8277720911f32a3b355754,-7.92,11225.0,11444.6602,278.61,4000.0,27,534.0,0.0000
3,000b9612a76d2800a8daad266c9e1eb1,-7.78,14050.0,16533.8203,377.50,1500.0,18,467.0,0.3586
4,000c9915e9eb935ee32c691e2f3efb49,0.56,10700.0,15157.8096,347.81,1200.0,8,495.0,0.5889
...,...,...,...,...,...,...,...,...,...
13713,ffe1824fff3e03deb2b60ec3085f220e,-7.49,8525.0,10748.2002,315.47,2000.0,28,,0.1990
13714,ffe620e2913ec71c7237dffba00e4f88,-2.81,10175.0,14161.3896,339.04,1500.0,8,529.0,0.3380
13715,fff3e5cfb61567b4cbcde156dc646226,-5.94,13175.0,17433.4297,395.53,5000.0,20,503.0,0.4633
13716,fffb70be4982e918fe859491c7f1c043,-8.55,20900.0,25519.6699,538.79,2650.0,19,497.0,0.5452


In [7]:
df['FICOScore']=df['FICOScore'].fillna(df['FICOScore'].mean())
df['VehicleValueBlackBook']=df['VehicleValueBlackBook'].fillna(df['VehicleValueBlackBook'].mean())

In [8]:
target = df['true_recovery_rate']
df = df.drop(columns = ['true_recovery_rate', 'masked_acct'])
df.insert(0,'true_recovery_rate', target)

In [9]:
Correlation = df.corr()
Correlation['true_recovery_rate'].sort_values(ascending = 0)

true_recovery_rate         1.000000
MoodysUVIDiffCOFromOrig    0.265788
VehicleValueBlackBook      0.213596
FinancedAmt                0.152556
PaymentAmt                 0.133799
TotalDownPmt               0.099443
FICOScore                 -0.023946
ChargeOffMOB              -0.309455
Name: true_recovery_rate, dtype: float64

### 3. Train test split

In [10]:
import numpy as np
df_randomized = df.sample(frac=1, random_state=123)
df_randomized

Unnamed: 0,true_recovery_rate,MoodysUVIDiffCOFromOrig,VehicleValueBlackBook,FinancedAmt,PaymentAmt,TotalDownPmt,ChargeOffMOB,FICOScore
3856,0.3326,-6.59,10800.0,17292.5293,372.73,4500.0,31,521.365109
8983,0.6581,-2.39,14825.0,17969.2891,394.92,2500.0,9,497.000000
4135,0.5997,0.49,10225.0,15160.0000,340.71,1335.0,9,521.365109
4075,0.3489,-5.55,12850.0,18795.6309,459.44,1990.0,17,545.000000
4042,0.0397,-10.49,12825.0,15295.2002,348.00,3500.0,20,590.000000
...,...,...,...,...,...,...,...,...
5218,0.3784,-9.30,14175.0,18250.8008,440.80,5200.0,16,465.000000
12252,0.1252,-3.78,10000.0,15990.0000,381.68,3000.0,9,521.365109
1346,0.7074,-0.79,14675.0,17144.0000,439.89,1000.0,8,477.000000
11646,0.0151,-11.40,4200.0,12672.6104,383.28,1000.0,37,521.365109


In [11]:
train_data, test_data = np.split(df_randomized, [int(0.7*len(df_randomized))])

print(train_data.shape, test_data.shape)

(9602, 8) (4116, 8)


### 3.1 Setting up path to upload the datasets to S3

In [12]:
import os

prefix = 'datasets'

train_csv_path = 's3://{}/{}/{}'.format(bucket_name, prefix, 'train.csv')
test_csv_path = 's3://{}/{}/{}'.format(bucket_name, prefix, 'test.csv')

print(train_csv_path)
print(test_csv_path)

s3://loan-default-predictor/datasets/train.csv
s3://loan-default-predictor/datasets/test.csv


In [13]:
train_data.to_csv(train_csv_path, index = False, header = False)
test_data.to_csv(test_csv_path, index = False, header = False)

In [14]:
X_train = train_data.drop(columns = ['true_recovery_rate'])
y_train = train_data['true_recovery_rate']

X_test = test_data.drop(columns = ['true_recovery_rate'])
y_test = test_data['true_recovery_rate']

In [15]:
X_train.shape, X_test.shape, y_train.shape

((9602, 7), (4116, 7), (9602,))

### 4. Using Linear Regression model

In [16]:
from sklearn.linear_model import LinearRegression

my_model = LinearRegression()
my_model.fit(X_train, y_train)

#### 4.1 Calculating error for Linear Regression model:

In [17]:
y_pred_test = my_model.predict(X_test)

# MAE
e = y_test - y_pred_test
## absolute, mean
MAE = np.sum(np.abs(e))/y_test.shape
np.mean(np.abs(e))

# MSE
MSE = np.mean(e**2)

# RMSE
RMSE = MSE**0.5

MAE, RMSE

(array([0.16228249]), 0.20963028895147168)

### 5. Using Lasso model

In [18]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Grid Search with CV - LASSO Case
lasso_grid = { 'alpha' : [0.0001]}

# 2. Define function and fit the data
# EST
lasso = Lasso()

lasso_cv_reg = GridSearchCV(lasso, lasso_grid, cv = 5)

lasso_cv_reg.fit(X_train, y_train)

#### 5.1 Calculating errors for Lasso

In [19]:
# error
y_test_L1 = lasso_cv_reg.predict(X_test)
e = y_test - y_test_L1

# MSE_l1 = np.mean(e**2)
MSE_lp = np.mean(e**2)

# RMSE
RMSE_l1 = MSE_lp**0.5

RMSE_l1

0.2096303528700987

### 6. Build XGBoost Model

In [20]:
import sagemaker
from sagemaker import image_uris
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput

### 6.1 Selecting the XGBoost container

In [21]:
xgboost_container = image_uris.retrieve("xgboost", boto3.Session().region_name, "1.5-1")

display(xgboost_container)

'683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.5-1'

### 6.2 Initializing the hyperparameters

In [22]:
hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "objective":"reg:squarederror",
        "num_round":"50"
}

### 6.3 Setting an output path to store the trained model

In [23]:
output_path = 's3://{}/{}/'.format(bucket_name, 'output')

print(output_path)

s3://loan-default-predictor/output/


### 6.4 Constructing a SageMaker estimator that calls the xgboost-container

In [24]:
estimator = sagemaker.estimator.Estimator(image_uri = xgboost_container, 
                                          hyperparameters = hyperparameters,
                                          role = sagemaker.get_execution_role(),
                                          instance_count = 1, 
                                          instance_type = 'ml.m5.large', 
                                          volume_size = 5, # 5 GB 
                                          output_path = output_path,
                                          use_spot_instances = True,
                                          max_run = 300,
                                          max_wait = 600
                                         )

### 6.5 Define the data type and paths to the training and validation datasets

In [25]:
train_input = TrainingInput(
    "s3://{}/{}/{}".format(bucket_name, prefix, "train.csv"), content_type="csv"
)
test_input = TrainingInput(
    "s3://{}/{}/{}".format(bucket_name, prefix, "test.csv"), content_type="csv"
)

### 6.6 Execute the XGBoost training job

In [26]:
estimator.fit({"train": train_input, "validation": test_input}, wait=True)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-03-15-19-26-47-789


2023-03-15 19:26:47 Starting - Starting the training job...
2023-03-15 19:27:04 Starting - Preparing the instances for training......
2023-03-15 19:27:56 Downloading - Downloading input data...
2023-03-15 19:28:37 Training - Downloading the training image.....[34m[2023-03-15 19:29:27.452 ip-10-0-159-6.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-03-15 19:29:27.526 ip-10-0-159-6.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2023-03-15:19:29:27:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-03-15:19:29:27:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2023-03-15:19:29:27:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-03-15:19:29:27:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2023-03-15:19:29:27:INFO] Determined 0 GPU(s) available on the instance.[0m
[34m[2023-03

### 7. Deploy trained xgb model as Endpoint

In [27]:
from sagemaker.serializers import CSVSerializer

xgb_predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    serializer=CSVSerializer()
)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-03-15-19-30-07-737
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-03-15-19-30-07-737
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-03-15-19-30-07-737


------------!

In [28]:
xgb_predictor.endpoint_name

'sagemaker-xgboost-2023-03-15-19-30-07-737'

In [40]:
x_new=np.array([[2.39,6100.0,10397.4502,282.2,750.0,15,535.0]])

In [41]:
y_predict=xgb_predictor.predict(x_new).decode('utf-8')
print(y_predict, type(y_predict))

0.3924473822116852
 <class 'str'>


### 8. Lambda function handler

In [50]:
import boto3

ENDPOINT_NAME = 'sagemaker-xgboost-2023-03-15-19-30-07-737'
runtime = boto3.client ('runtime.sagemaker')

def lambda_handler (event, context):
    # inputs = event['data']
    inputs=np.array([[2.39,6100.0,10397.4502,282.2,750.0,15,535.0]])
    
    serialized_input = ','.join(map(str, inputs[0]))
    
    response = runtime.invoke_endpoint(EndpointName=ENDPOINT_NAME,
                                        ContentType='text/csv',
                                        Body=serialized_input)
    
    result = response['Body'].read().decode()
    return result

Input_json = { 'data':
              [[2.39,6100.0,10397.4502,282.2,750.0,15,535.0]]
}

result=lambda_handler(Input_json, __)
result

'0.3924473822116852\n'

### 9. Creating a lambda function

### 10. Testing API Endpoint

In [51]:
# importing the requests library
import requests
# defining the api-endpoint
API_ENDPOINT="https://sfmd4hbbpb.execute-api.us-east-1.amazonaws.com/dev"
# data to be sent to api
json = { "data":
              [[2.39,6100.0,10397.4502,282.2,750.0,15,535.0]],
        "name": 'John'
}
# sending post request and saving response as response object
r = requests.post (url = API_ENDPOINT, json = json)

In [52]:
print(f"Status Code: {r.status_code}, Response: {r.json ()}")

Status Code: 200, Response: 0.3924473822116852

