# 2. Feature Engineering

## 2.1 Encoding Categorical Features

In [36]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

# Boto3 is the Amazon Web Services (AWS) Software Development Kit (SDK) for Python
# Boto3 allows Python developer to write software that makes use of services like Amazon S3 and Amazon EC2
import sagemaker
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri

# 500 rows and 200 columns will be presented
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 500)

In [2]:
creditcard_df = pd.read_csv('UCI_Credit_Card.csv')
creditcard_df.drop(['ID'], axis = 1, inplace = True)

In [3]:
creditcard_df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [4]:
X_cat = creditcard_df[['SEX', 'EDUCATION', 'MARRIAGE']]

In [5]:
onehotencoder = OneHotEncoder()
X_cat = onehotencoder.fit_transform(X_cat).toarray()

In [6]:
X_cat.shape

(30000, 13)

In [7]:
X_cat = pd.DataFrame(X_cat)

In [8]:
X_numerical = creditcard_df[['LIMIT_BAL', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 
                'BILL_AMT1','BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
                'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']]

In [9]:
X = pd.concat([X_cat, X_numerical], axis = 1)

## 2.2 Feature Scaling

In [10]:
numerical_columns = ['LIMIT_BAL', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 
                'BILL_AMT1','BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
                'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']

In [12]:
# Step 1: Apply Log Transformation (Add 1 to avoid log(0) issue)
X_log = X[numerical_columns].apply(lambda x: np.log(x + 1)) 
X_log.replace([np.inf, -np.inf], np.nan, inplace=True)

# Step 2: Apply Standardization
scaler = StandardScaler()
X_standardized = pd.DataFrame(scaler.fit_transform(X_log), columns=numerical_columns)

X[numerical_columns] = X_standardized

In [13]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,LIMIT_BAL,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-1.869589,-1.421936,1.819879,1.778224,,,,,-0.311671,-0.309853,-0.703149,-2.527426,-2.412934,-2.254681,-2.039844,-0.007918,-1.875766,-1.789384,-1.751539,-1.681626
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.034257,-1.107148,,1.778224,-0.511674,-0.455726,-0.412696,2.156054,-0.436338,-0.49067,-0.300667,-0.191475,-0.122894,-0.080164,-2.039844,0.105561,0.186755,0.244672,-1.751539,0.472947
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.271426,-0.045689,-0.643146,-0.525862,-0.511674,-0.455726,-0.412696,-0.429351,0.352221,0.155222,0.179521,0.234759,0.288728,0.339607,0.214077,0.229126,0.186755,0.244672,0.254523,0.732579
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.895987,0.290684,-0.643146,-0.525862,-0.511674,-0.455726,-0.412696,-0.429351,0.508843,0.535933,0.562038,0.43128,0.474585,0.512157,0.298869,0.3197,0.241135,0.272706,0.273879,0.27662
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.895987,2.020263,,-0.525862,,-0.455726,-0.412696,-0.429351,-0.051099,-0.12397,0.46755,0.344208,0.358291,0.395327,0.298869,1.20395,0.873894,0.891311,0.146489,0.167023


In [14]:
y = creditcard_df['default.payment.next.month']

# 3. Modeling & Evaluation

In [22]:
X = np.array(X)
y = np.array(y)

In [23]:
# reshaping the array from (30000,) to (30000, 1)
y = np.array(y)
y = y.reshape(-1,1)
y.shape

(30000, 1)

In [24]:
# spliting the data into training, testing and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size = 0.5)

In [25]:
# Convert the array into dataframe in a way that target variable is set as the first column and followed by feature columns
# This is because sagemaker built-in algorithm expects the data in this format.

train_data = pd.DataFrame({'Target': y_train[:,0]})
for i in range(X_train.shape[1]):
    train_data[i] = X_train[:,i]

In [27]:
val_data = pd.DataFrame({'Target':y_test[:,0]})
for i in range(X_test.shape[1]):
    val_data[i] = X_test[:,i]

In [29]:
# save train_data and validation_data as csv files.

train_data.to_csv('train.csv', header = False, index = False)
val_data.to_csv('validation.csv', header = False, index = False)

In [31]:
# Create a sagemaker session
sagemaker_session = sagemaker.Session()

# S3 bucket and prefix that we want to use
# default_bucket - creates a Amazon S3 bucket to be used in this session
bucket = sagemaker_session.default_bucket()                    # Set a default S3 bucket
prefix = 'XGBoost-classifier'
key = 'XGBoost-classifier'
#Roles give learning and hosting access to the data
#This is specified while opening the sagemakers instance in "Create an IAM role"
role = sagemaker.get_execution_role()

In [32]:
print(role)

arn:aws:iam::339712697475:role/SageMakerExecutionRole


In [33]:
# read the data from csv file and then upload the data to s3 bucket
import os
with open('train.csv','rb') as f:
    # The following code uploads the data into S3 bucket to be accessed later for training
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(f)

# Let's print out the training data location in s3
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('uploaded training data location: {}'.format(s3_train_data))

uploaded training data location: s3://sagemaker-us-east-1-339712697475/XGBoost-classifier/train/XGBoost-classifier


In [34]:
# read the data from csv file and then upload the data to s3 bucket

with open('validation.csv','rb') as f:
    # The following code uploads the data into S3 bucket to be accessed later for training
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation', key)).upload_fileobj(f)
    
# Let's print out the validation data location in s3
s3_validation_data = 's3://{}/{}/validation/{}'.format(bucket, prefix, key)
print('uploaded validation data location: {}'.format(s3_validation_data))

uploaded validation data location: s3://sagemaker-us-east-1-339712697475/XGBoost-classifier/validation/XGBoost-classifier


In [35]:
# create output placeholder in S3 bucket to store the output
output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('training artifacts will be uploaded to: {}'.format(output_location))

training artifacts will be uploaded to: s3://sagemaker-us-east-1-339712697475/XGBoost-classifier/output


In [37]:
# This code is used to get the training container of sagemaker built-in algorithms
# all we have to do is to specify the name of the algorithm, that we want to use

# Let's obtain a reference to the XGBoost container image
# Note that all regression models are named estimators
# You don't have to specify (hardcode) the region, get_image_uri will get the current region name using boto3.Session

container = sagemaker.image_uris.retrieve("xgboost", boto3.Session().region_name, version='1.0-1')

In [38]:
# Specify the type of instance that we would like to use for training 
# output path and sagemaker session into the Estimator. 
# We can also specify how many instances we would like to use for training

# Recall that XGBoost works by combining an ensemble of weak models to generate accurate/robust results. 
# The weak models are randomized to avoid overfitting

# num_round: The number of rounds to run the training.
# Alpha: L1 regularization term on weights. Increasing this value makes models more conservative.
# eta: Step size shrinkage used in updates to prevent overfitting. 

Xgboost_classifier = sagemaker.estimator.Estimator(container,
                                                   role, 
                                                   instance_count = 1, 
                                                   instance_type = 'ml.m4.xlarge',
                                                   output_path = output_location,
                                                   sagemaker_session = sagemaker_session)

# To improve the performance of the model, a hyperparameters tuning job need to be run 

Xgboost_classifier.set_hyperparameters(max_depth = 20,
                                       objective = 'multi:softmax',
                                       num_class = 2,
                                       eta = 0.1,
                                       num_round = 150)

In [39]:
# Creating "train", "validation" channels to feed in the model
train_input = sagemaker.session.TrainingInput(s3_data = s3_train_data, content_type='csv',s3_data_type = 'S3Prefix')
valid_input = sagemaker.session.TrainingInput(s3_data = s3_validation_data, content_type='csv',s3_data_type = 'S3Prefix')

data_channels = {'train': train_input,'validation': valid_input}
Xgboost_classifier.fit(data_channels)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-09-03-22-35-51-132


2024-09-03 22:35:56 Starting - Starting the training job...
2024-09-03 22:36:11 Starting - Preparing the instances for training...
2024-09-03 22:36:42 Downloading - Downloading input data...
2024-09-03 22:37:17 Downloading - Downloading the training image......
2024-09-03 22:37:58 Training - Training image download completed. Training in progress...[34m[2024-09-03 22:38:11.930 ip-10-0-102-16.ec2.internal:8 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[