In [1]:
import sagemaker,boto3,os
import pandas as pd
import numpy as np
from sagemaker import get_execution_role
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [2]:
role = get_execution_role()
print(role)

arn:aws:iam::616166080248:role/service-role/AmazonSageMaker-ExecutionRole-20220105T164979


In [3]:
conn = boto3.client('s3')
bucket = "ml-case-study-data"
contents = conn.list_objects(Bucket=bucket, Prefix="")['Contents']
for f in contents:
    print(f['Key'])

/train/dataset.csv
dataset.csv
default_predictor_05012022.pkl
output/xgboost-2022-01-06-03-03-14-555/profiler-output/system/incremental/2022010603/1641438360.algo-1.json
output/xgboost-2022-01-06-03-03-14-555/profiler-output/system/incremental/2022010603/1641438420.algo-1.json
output/xgboost-2022-01-06-10-44-05-743/profiler-output/system/incremental/2022010610/1641465960.algo-1.json
output/xgboost-2022-01-06-10-44-05-743/profiler-output/system/incremental/2022010610/1641466020.algo-1.json
sagemaker/defaulter-prediction-xgboost/output/xgboost-2022-01-06-12-53-29-685/profiler-output/system/incremental/2022010612/1641473760.algo-1.json
sagemaker/defaulter-prediction-xgboost/output/xgboost-2022-01-06-14-07-20-768/profiler-output/system/incremental/2022010614/1641478200.algo-1.json
sagemaker/defaulter-prediction-xgboost/output/xgboost-2022-01-06-14-07-20-768/profiler-output/system/incremental/2022010614/1641478260.algo-1.json
sagemaker/defaulter-prediction-xgboost/train/train.csv
train.csv


In [4]:
dataFile = "dataset.csv"
response = conn.get_object(Bucket=bucket, Key=dataFile)
body = response['Body']
full_data = pd.read_csv(body,sep=";",index_col=0)

In [5]:
full_data.columns

Index(['default', 'account_amount_added_12_24m', 'account_days_in_dc_12_24m',
       'account_days_in_rem_12_24m', 'account_days_in_term_12_24m',
       'account_incoming_debt_vs_paid_0_24m', 'account_status',
       'account_worst_status_0_3m', 'account_worst_status_12_24m',
       'account_worst_status_3_6m', 'account_worst_status_6_12m', 'age',
       'avg_payment_span_0_12m', 'avg_payment_span_0_3m', 'merchant_category',
       'merchant_group', 'has_paid', 'max_paid_inv_0_12m',
       'max_paid_inv_0_24m', 'name_in_email',
       'num_active_div_by_paid_inv_0_12m', 'num_active_inv',
       'num_arch_dc_0_12m', 'num_arch_dc_12_24m', 'num_arch_ok_0_12m',
       'num_arch_ok_12_24m', 'num_arch_rem_0_12m',
       'num_arch_written_off_0_12m', 'num_arch_written_off_12_24m',
       'num_unpaid_bills', 'status_last_archived_0_24m',
       'status_2nd_last_archived_0_24m', 'status_3rd_last_archived_0_24m',
       'status_max_archived_0_6_months', 'status_max_archived_0_12_months',
       

In [6]:
# Splitting the dataset for training and actual data based on the availability of the target variable
train_data = full_data.loc[(full_data['default'] == 0.0) | (full_data['default'] == 1.0)]
pred_data = full_data.loc[(full_data['default'] != 0.0) & (full_data['default'] != 1.0)]

In [7]:
#Initialize a list of features to be dropped

#UUID is a unique identifier and may not help in the actual prediction.
#drop_features = ["uuid"]

# dropping columns with more than 49 percentage of missing data (found during EDA)
drop_features = ['account_incoming_debt_vs_paid_0_24m',
 'account_status',
 'account_worst_status_0_3m',
 'account_worst_status_12_24m',
 'account_worst_status_3_6m',
 'account_worst_status_6_12m',
 'avg_payment_span_0_3m',
 'worst_status_active_inv']

In [8]:
# categorical features as mentioned in the data dictionary available in the case study document

categorical_features = ["account_status","account_worst_status_0_3m","account_worst_status_12_24m","account_worst_status_3_6m",
                       "account_worst_status_6_12m","merchant_category","merchant_group","has_paid","name_in_email","status_last_archived_0_24m",
                       "status_2nd_last_archived_0_24m","status_3rd_last_archived_0_24m","status_max_archived_0_6_months","status_max_archived_0_12_months",
                       "status_max_archived_0_24_months","worst_status_active_inv"]

In [9]:
# columns with missing data in the training set, all of them found to be numerical columns as per data dictionary

missing_data_columns = ['account_days_in_dc_12_24m', 'account_days_in_rem_12_24m',
       'account_days_in_term_12_24m', 'avg_payment_span_0_12m',
       'num_active_div_by_paid_inv_0_12m', 'num_arch_written_off_0_12m',
       'num_arch_written_off_12_24m']

In [11]:
X = train_data.drop(['default'], axis=1)
y = train_data['default']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=7)

In [12]:
missing_data_transformer = Pipeline(steps=[
                                     ('meanimputer', SimpleImputer(strategy='mean')),
                                     ('stdscaler', StandardScaler())
                                     ])

categorical_transformer = Pipeline(steps=[
                                         ('onehotenc', OneHotEncoder(handle_unknown='ignore'))
                                         ])
column_transformer = ColumnTransformer(transformers=[('drop_columns', 'drop', drop_features),
                                                   ('numeric_processing',missing_data_transformer, missing_data_columns),
                                                    ('categorical_processing', categorical_transformer, categorical_features)
                                                  ], remainder='drop')

In [17]:
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
def confusion_heatMap(confusionMatrix):
  class_names=[0,1]
  fig, ax = plt.subplots()
  tick_marks = np.arange(len(class_names))
  plt.xticks(tick_marks, class_names)
  plt.yticks(tick_marks, class_names)
  sns.heatmap(pd.DataFrame(confusionMatrix), annot=True, cmap="YlGnBu" ,fmt='g')
  ax.xaxis.set_label_position("top")
  plt.tight_layout()
  plt.title('Confusion matrix', y=1.1)
  plt.ylabel('Actual label')
  plt.xlabel('Predicted label')

In [22]:
preprocessor_pipeline = Pipeline([
                     ('transform_column', column_transformer),
                    ])
preprocessor_pipeline.fit(X)

Pipeline(steps=[('transform_column',
                 ColumnTransformer(transformers=[('drop_columns', 'drop',
                                                  ['account_incoming_debt_vs_paid_0_24m',
                                                   'account_status',
                                                   'account_worst_status_0_3m',
                                                   'account_worst_status_12_24m',
                                                   'account_worst_status_3_6m',
                                                   'account_worst_status_6_12m',
                                                   'avg_payment_span_0_3m',
                                                   'worst_status_active_inv']),
                                                 ('numeric_processing',
                                                  Pipeline(steps...
                                                   'account_worst_status_12_24m',
                             

In [24]:
X.shape

(89976, 41)

In [25]:
X.columns

Index(['account_amount_added_12_24m', 'account_days_in_dc_12_24m',
       'account_days_in_rem_12_24m', 'account_days_in_term_12_24m',
       'account_incoming_debt_vs_paid_0_24m', 'account_status',
       'account_worst_status_0_3m', 'account_worst_status_12_24m',
       'account_worst_status_3_6m', 'account_worst_status_6_12m', 'age',
       'avg_payment_span_0_12m', 'avg_payment_span_0_3m', 'merchant_category',
       'merchant_group', 'has_paid', 'max_paid_inv_0_12m',
       'max_paid_inv_0_24m', 'name_in_email',
       'num_active_div_by_paid_inv_0_12m', 'num_active_inv',
       'num_arch_dc_0_12m', 'num_arch_dc_12_24m', 'num_arch_ok_0_12m',
       'num_arch_ok_12_24m', 'num_arch_rem_0_12m',
       'num_arch_written_off_0_12m', 'num_arch_written_off_12_24m',
       'num_unpaid_bills', 'status_last_archived_0_24m',
       'status_2nd_last_archived_0_24m', 'status_3rd_last_archived_0_24m',
       'status_max_archived_0_6_months', 'status_max_archived_0_12_months',
       'status_max

In [31]:
preprocessor_pipeline.transform(X)

<89976x144 sparse matrix of type '<class 'numpy.float64'>'
	with 1983321 stored elements in Compressed Sparse Row format>

In [23]:
import joblib
joblib.dump(preprocessor_pipeline,"preprocessor_pipeline.pkl")

['preprocessor_pipeline.pkl']