# LendingGenie Model implementation in SageMaker

>* This file is used to implemented shortlisted model in the SageMaker for AWS deployment
>* Model is shortlisted based on analysis performed in 'lending-genie.ipynb' file

Presently the shortlisted model happens to be 'LinearSVC with balanced weights'

In [None]:
import boto3
import io
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer, json_deserializer
import smutil

In [3]:
#! aws s3 cp file_from_path file_to_path

# Read the data from AWS S3

In [4]:
df = pd.read_csv(
     's3://sagemaker-us-east-1-831850195362/archive/accepted_2007_to_2018q4.csv/accepted_2007_to_2018Q4.csv',
     low_memory=False,
     chunksize=10000
)


In [5]:
bank1_df = pd.DataFrame(df.get_chunk(1200000))

In [6]:
bank1_df.shape

(1200000, 151)

# Data clean up and preparation

>* Non-predictive columns are dropped out
>* Rows with null values are filled with 0 (in this case, there were no null values)
>* Inhouse special encoding is performed to avoid creating more features using OneHotEncoder or get_dummies
>* Data is scaled using StandardScaler
>* PCA analysis is performed to reduce dimensions
>* Data is split into Train and Test sets

In [7]:
# drop prescribed columns suggested by analysis performed in other parts
bank1_df = bank1_df.drop(['id',
                          'member_id',
                          'issue_d',
                          'url',
                          'zip_code',
                          'initial_list_status',
                          'hardship_flag',
                          'hardship_type',
                          'hardship_reason',
                          'hardship_status',
                          'deferral_term',
                          'hardship_amount',
                          'hardship_start_date',
                          'hardship_end_date',
                          'payment_plan_start_date',
                          'hardship_length',
                          'hardship_dpd',
                          'hardship_loan_status',
                          'orig_projected_additional_accrued_interest',
                          'hardship_payoff_balance_amount',
                          'hardship_last_payment_amount',
                          'disbursement_method',
                          'debt_settlement_flag',
                          'debt_settlement_flag_date',
                          'settlement_status',
                          'settlement_date',
                          'settlement_amount',
                          'settlement_percentage',
                          'settlement_term',
                          'pymnt_plan','desc',
                          'purpose',
                          'title',
                          'last_pymnt_d',
                          'next_pymnt_d',
                          'earliest_cr_line',
                          'last_credit_pull_d',
                          'sec_app_earliest_cr_line',
                          'emp_title'], 
                         axis=1)
display(bank1_df.shape)

(1200000, 112)

In [8]:
# replace or fill NaN with 0 and confirm
bank1_df = bank1_df.fillna(0.0)
bank1_df.reset_index()
bank1_df.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,...,sec_app_fico_range_high,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog
0,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,10+ years,MORTGAGE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,10+ years,MORTGAGE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,10+ years,MORTGAGE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,10+ years,MORTGAGE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,3 years,MORTGAGE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# encode data using methods analysed in other modules
numerize_wilco_meth = {'term': {' 36 months': 1.0, ' 60 months': 2.0},
                    'grade': {'A': 1.0, 'B': 2.0, 'C': 3.0, 'D': 4.0, 'E': 5.0,
                              'F': 6.0, 'G': 7.0,
                             },
                    'sub_grade': {'A1': 1.0, 'A2': 2.0, 'A3': 3.0, 'A4': 4.0, 'A5': 5.0,
                                  'B1': 11.0, 'B2': 12.0, 'B3': 13.0, 'B4': 14.0, 'B5': 15.0,
                                  'C1': 21.0, 'C2': 22.0, 'C3': 23.0, 'C4': 24.0, 'C5': 25.0,
                                  'D1': 31.0, 'D2': 32.0, 'D3': 33.0, 'D4': 34.0, 'D5': 35.0,
                                  'E1': 41.0, 'E2': 42.0, 'E3': 43.0, 'E4': 44.0, 'E5': 45.0,
                                  'F1': 51.0, 'F2': 52.0, 'F3': 53.0, 'F4': 54.0, 'F5': 55.0,
                                  'G1': 61.0, 'G2': 62.0, 'G3': 63.0, 'G4': 64.0, 'G5': 65.0,
                                    },
                    'addr_state': {'CA': 1.0, 'NY': 2.0, 'TX': 3.0, 'FL': 4.0, 'IL': 5.0,
                                   'NJ': 11.0, 'PA': 12.0, 'OH': 13.0, 'GA': 14.0, 'VA': 15.0,
                                   'NC': 21.0, 'MI': 22.0, 'MD': 23.0, 'AZ': 24.0, 'MA': 25.0,
                                   'CO': 31.0, 'WA': 32.0, 'MN': 33.0, 'IN': 34.0, 'MO': 35.0,
                                   'CT': 41.0, 'TN': 42.0, 'NV': 43.0, 'WI': 44.0, 'SC': 45.0,
                                   'AL': 51.0, 'OR': 52.0, 'LA': 53.0, 'KY': 54.0, 'OK': 55.0,
                                   'KS': 61.0, 'AR': 62.0, 'UT': 63.0, 'MS': 64.0, 'NM': 65.0,
                                   'NH': 71.0, 'HI': 72.0, 'RI': 73.0, 'WV': 74.0, 'NE': 75.0,
                                   'DE': 81.0, 'MT': 82.0, 'DC': 83.0, 'AK': 84.0, 'ME': 85.0,
                                   'VT': 91.0, 'WY': 92.0, 'SD': 93.0, 'ID': 94.0, 'ND': 95.0,
                                   'IA': 101.0,
                                  },'emp_length': {'< 1 year': 0.0, '1 year': 1.0, '2 years': 2.0, '3 years': 3.0, '4 years': 4.0,
                                   '5 years': 5.0, '6 years': 6.0, '7 years': 7.0, '8 years': 8.0, '9 years': 9.0,
                                   '10+ years': 10.0,
                                  },
                    'loan_status': {'Fully Paid': 1.0, 'Current': 1.0, 'Does not meet the credit policy. Status:Fully Paid': 1.0,
                                    'Charged Off': 0.0, 'In Grace Period': 0.0, 'Late (16-30 days)': 0.0, 'Late (31-120 days)': 0.0,
                                    'Default': 0.0, 'Does not meet the credit policy. Status:Charged Off': 0.0,
                                   },
                    'home_ownership': {'NONE': 0.0, 'RENT': 1.0, 'ANY': 2.0, 'MORTGAGE': 3.0, 'OWN': 4.0,
                                       'OTHER': 5.0,
                                      },
                    'application_type': {'Individual': 1.0, 'Joint App': 2.0,
                                       },
                    'verification_status': {'Not Verified': 0.0, 'Verified': 1.0, 'Source Verified': 2.0,
                                           },
                    'verification_status_joint': {'Not Verified': 0.0, 'Verified': 1.0, 'Source Verified': 2.0,
                                                 },
                   }
bank1_df = bank1_df.replace(numerize_wilco_meth)


In [10]:
# Define bank1_df target set y using the 'loan_status' column
y = bank1_df['loan_status']
# Display a sample of target b1y


In [11]:
SEED = 42 

In [12]:
# get all the features in X
X = bank1_df.drop(columns = 'loan_status')

In [13]:
# perform sclaing using StandardScaler
X_scaled = StandardScaler().fit_transform(X)

In [15]:
# Perform PCA on the StandardScaler() features data set X_scaled
pca = PCA().fit(X_scaled)
X_scaled_pca = PCA(n_components = 0.95, random_state = SEED)
X_scaled = X_scaled_pca.fit_transform(X_scaled)

In [16]:
# Full PCA dataset
# Split the bank1-df StandardScaler scaled PCA data into training and testing datasets using train_test_split().

X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
    X_scaled,y, 
    test_size=0.25,                # 75% train : 25% test split
    random_state = SEED
)

In [17]:
data_dictionary = {
    "X_train" : X_train_scaled,
    "X_test" : X_test_scaled,
    "y_train" : y_train,
    "y_test" : y_test
}

In [18]:
print(X_train_scaled.shape[1])

62


# Define variables used by AWS

In [19]:
bucket = "fintechtestbucket-pp"
prefix = "lendinggenie/linear-learner"
role = get_execution_role()
model_type = "linear-learner"
instance_type = "ml.m4.xlarge"
n_features = (X_train_scaled.shape[1])

n_epochs = 100 # put here number of epochs

# Store data in S3 bucket after enconding data in Protocol Buffer format

In [20]:
# store data in S3 buckets
s3_train_data,s3_test_data = smutil.store_data_s3(bucket, prefix, role, data_dictionary)

train data uploaded to: s3://fintechtestbucket-pp/lendinggenie/linear-learner/train/model_train.data
test data uploaded to: s3://fintechtestbucket-pp/lendinggenie/linear-learner/test/model_test.data


# Define hyper paramters used for various models

In [21]:
# set hyper parameters in this section
# for logistic regression: 
"""l_hyperparams = {
    "feature_dim": n_features,
    "predictor_type": "binary_classifier",
    "epochs": n_epochs,
}
        
# Binary classifier with automated threshold tuning
le_hyperparams = {
    "feature_dim": n_features,
    "predictor_type": "binary_classifier",
    "binary_classifier_model_selection_criteria": "precision_at_target_recall",
    "target_recall": 0.9,
    "epochs": n_epochs,
}

# Binary classifier with class weights and automated threshold tuning
leb_hyperparams = {
    "feature_dim": n_features,
    "predictor_type": "binary_classifier",
    "binary_classifier_model_selection_criteria": "precision_at_target_recall",
    "target_recall": 0.9,
    "positive_example_weight_mult": "balanced",
    "epochs": n_epochs,
}

# Linear SVM
lsvc_hyperparams = {
    "feature_dim": n_features,
    "predictor_type": "binary_classifier",
    "loss": "hinge_loss",
    "binary_classifier_model_selection_criteria": "precision_at_target_recall",
    "target_recall": 0.9,
    "epochs": n_epochs,
}"""

# Linear SVM with balanced class weights
lsvcb_hyperparams = {
    "feature_dim": n_features,
    "predictor_type": "binary_classifier",
    "loss": "hinge_loss",
    "binary_classifier_model_selection_criteria": "precision_at_target_recall",
    "target_recall": 0.9,
    "positive_example_weight_mult": "balanced",
    "epochs": n_epochs,
}


# Create model using image container

In [22]:
# create model learners in this section
lsvcb_model = smutil.create_model(bucket, prefix, role,model_type, instance_type,lsvcb_hyperparams)

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


# Train the model using available dataset and deploy the model

In [23]:
# fit the model
lsvcb_model.fit({"train": s3_train_data, "test": s3_test_data})
lsvcb_predictor = lsvcb_model.deploy(initial_instance_count=1, instance_type=instance_type)

lsvcb_predictor.serializer = csv_serializer
lsvcb_predictor.deserializer = json_deserializer


2022-12-15 07:45:41 Starting - Starting the training job...ProfilerReport-1671090341: InProgress
...
2022-12-15 07:46:28 Starting - Preparing the instances for training...............
2022-12-15 07:49:12 Downloading - Downloading input data.........
2022-12-15 07:50:29 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[12/15/2022 07:50:29 INFO 140077352638272] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'optimizer': 'auto',

In [24]:
print(lsvcb_predictor)

<sagemaker.predictor.Predictor object at 0x7f747690c550>


In [83]:
X_test_scaled.shape

(300000, 62)

# Split the test data into smaller chunks

In [84]:
new_X = np.array_split(X_test_scaled, 100)

In [85]:
new_y = np.array_split(y_test, 100)

In [86]:
new_X[1].shape

(3000, 62)

In [87]:
new_y[1].shape

(3000,)

# Iterate over the smaller test dataset for prediction and evaulation

In [88]:
metrics_list = []

In [89]:
for i in range(1,100):
   metrics_list.append(smutil.evaluate(lsvcb_predictor, new_X[i], new_y[i], "Linear SVC with class weights", False))

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The json_deserializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The json_deserializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The json_deserializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The json_deserializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The csv_seri

In [90]:
#metrics_list

In [91]:
pd.set_option("display.float_format", lambda x: "%.2f" % x)

In [92]:
df = pd.DataFrame(metrics_list)

In [96]:
df['Recall2'] = df['TN'] / (df['TN']+df['FP'])
df['Precision2'] = df['TN'] / (df['TN']+df['FN'])

# Check the model evaluation metrics

In [97]:
df.loc[:,["Model", "Recall", "Precision", "Recall2", "Precision2","Accuracy", "F1"]].mean()

  """Entry point for launching an IPython kernel.


Recall       0.92
Precision    0.99
Recall2      0.96
Precision2   0.68
Accuracy     0.93
F1           0.96
dtype: float64

# Close the sagemaker endpoint

In [94]:
smutil.delete_endpoint(lsvcb_predictor)

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


Deleted linear-learner-2022-12-15-07-57-52-588
