Data pre-processing:

In [1]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor

# Region name reported by a default boto3 session.
region = boto3.session.Session().region_name

# Execution role returned by the SageMaker SDK; handed to the processor below.
role = get_execution_role()

# Scikit-learn processor: one ml.m5.xlarge instance, framework version 0.20.0.
sklearn_processor = SKLearnProcessor(
    role=role,
    framework_version="0.20.0",
    instance_count=1,
    instance_type="ml.m5.xlarge",
)

In [2]:
import pandas as pd

# S3 location of the loans dataset. The path is a fixed literal with no
# placeholder, so it does not depend on the session region.
input_data = "s3://loan-repayment-prediction-bda-project/loans.csv"

# Read only the first 10 rows to preview the columns and sample values.
# NOTE(review): later cells reuse this `df`, so they also see only these
# 10 rows — confirm that is intended before relying on the split below.
df = pd.read_csv(input_data, nrows=10)
df.head(n=10)

Unnamed: 0,id,loan_status,loan_amount,funded_amount_by_investors,loan_term,interest_rate,installment,grade,sub_grade,verification_status,...,earliest_credit_line,inquiries_last_6_months,open_credit_lines,derogatory_public_records,revolving_line_utilization_rate,total_credit_lines,employment_length,employer_title,home_ownership,annual_income
0,1077501,fully paid,5000,4975,36,10.65,162.87,b,b2,verified,...,1/1/1985,1,3,0,83.7,9,10,,rent,24000
1,1077430,charged off,2500,2500,60,15.27,59.83,c,c4,source verified,...,4/1/1999,5,3,0,9.4,4,1,ryder,rent,30000
2,1077175,fully paid,2400,2400,36,15.96,84.33,c,c5,not verified,...,11/1/2001,2,2,0,98.5,10,10,,rent,12252
3,1076863,fully paid,10000,10000,36,13.49,339.31,c,c1,source verified,...,2/1/1996,1,10,0,21.0,37,10,air resources board,rent,49200
4,1075358,current,3000,3000,60,12.69,67.79,b,b5,source verified,...,1/1/1996,0,15,0,53.9,38,1,university medical group,rent,80000
5,1075269,fully paid,5000,5000,36,7.9,156.46,a,a4,source verified,...,11/1/2004,3,9,0,28.3,12,3,veolia transportaton,rent,36000
6,1069639,fully paid,7000,7000,60,15.96,170.08,c,c5,not verified,...,7/1/2005,1,7,0,85.6,11,8,southern star photography,rent,47004
7,1072053,fully paid,3000,3000,36,18.64,109.43,e,e1,source verified,...,1/1/2007,2,4,0,87.5,4,9,mkc accounting,rent,48000
8,1071795,charged off,5600,5600,60,21.28,152.39,f,f2,source verified,...,4/1/2004,2,11,0,32.6,13,4,,own,40000
9,1071570,charged off,5375,5350,60,12.69,121.45,b,b5,verified,...,9/1/2004,0,2,0,36.5,3,1,starbucks,rent,15000


In [30]:
import numpy as np
from sklearn.model_selection import train_test_split

# Column names of the loans dataset, in a fixed order with the target
# (`loan_status`) last. Grouping comments are for readability only.
columns = [
    # loan identity and terms
    "id", "loan_amount", "funded_amount_by_investors", "loan_term",
    "interest_rate", "installment",
    # grading and verification
    "grade", "sub_grade", "verification_status",
    # issue date, purpose and debt-to-income
    "issued_on", "purpose", "dti",
    # credit history
    "earliest_credit_line", "inquiries_last_6_months", "open_credit_lines",
    "derogatory_public_records", "revolving_line_utilization_rate",
    "total_credit_lines",
    # borrower profile
    "employment_length", "employer_title", "home_ownership", "annual_income",
    # target
    "loan_status",
]

# Loan status labels; the position of each label is the integer code it is
# encoded as (0, 1, 2) when the target column is converted.
class_labels = [
    "fully paid",
    "charged off",
    "current",
]

In [28]:
# Remove exact duplicate rows, mutating `df` in place.
df.drop_duplicates(inplace=True)

# Encode the target as integer codes, using each label's position in
# `class_labels` as its code. The replacement is restricted to the
# `loan_status` column so that an identical string appearing in any other
# text column (e.g. `employer_title`) is left untouched. Values not listed
# in `class_labels` are kept as they are.
df["loan_status"] = df["loan_status"].replace(
    {label: code for code, label in enumerate(class_labels)}
)

In [29]:
# Fraction of the rows held out for the test set.
split_ratio = 0.25
print("Split data into train and test sets with ratio {}".format(split_ratio))

# Features are every column except the target; the target is `loan_status`.
# A fixed random_state is passed so the split can be repeated.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="loan_status"),
    df["loan_status"],
    test_size=split_ratio,
    random_state=0,
)

Split data into train and test sets with ratio 0.25


Data dimensions:

In [25]:
# Print a summary of the frame: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 9
Data columns (total 23 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   id                               10 non-null     int64  
 1   loan_status                      10 non-null     int64  
 2   loan_amount                      10 non-null     int64  
 3   funded_amount_by_investors       10 non-null     int64  
 4   loan_term                        10 non-null     int64  
 5   interest_rate                    10 non-null     float64
 6   installment                      10 non-null     float64
 7   grade                            10 non-null     object 
 8   sub_grade                        10 non-null     object 
 9   verification_status              10 non-null     object 
 10  issued_on                        10 non-null     object 
 11  purpose                          10 non-null     object 
 12  dti                      

In [27]:
# Display the frame's dimensions as (rows, columns).
df.shape

(10, 23)