# Evaluating a Machine Model in SageMaker Studio

In [13]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

## Load the Data into Pandas

In [14]:
# Load the CSV data into a DataFrame
file_path = Path("../Resources/german_credit_data.csv")
df = pd.read_csv(file_path)

# Display sample data
df.head()

Unnamed: 0,Age,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,22,2,own,little,moderate,5951,48,radio/TV,bad
1,45,2,free,little,little,7882,42,furniture/equipment,good
2,53,2,free,little,little,4870,24,car,bad
3,35,3,rent,little,moderate,6948,36,car,good
4,28,3,own,little,moderate,5234,30,car,bad


## Data Preprocessing

### Encode categorical variables using one-hot encoding

In [15]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

In [16]:
# Create a list of categorical variables
categorical_variables = ["Housing", "Saving accounts", "Checking account", "Purpose", "Risk"]

In [17]:
# Encode categorical variables using OneHotEncoder
encoded_data = enc.fit_transform(df[categorical_variables])

In [18]:
# Create a DataFrame with the encoded variables
encoded_df = pd.DataFrame(
    encoded_data,
    columns = enc.get_feature_names(categorical_variables)
)

# Display sample data
encoded_df.head()

Unnamed: 0,Housing_free,Housing_own,Housing_rent,Saving accounts_little,Saving accounts_moderate,Saving accounts_quite rich,Saving accounts_rich,Checking account_little,Checking account_moderate,Checking account_rich,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Risk_bad,Risk_good
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [19]:
# Add the numerical variables from the original DataFrame to the one-hot encoding DataFrame
encoded_df = pd.concat(
    [
        df[["Age", "Job", "Credit amount", "Duration"]],
        encoded_df
    ],
    axis=1
)

# Display sample data
encoded_df.head()

Unnamed: 0,Age,Job,Credit amount,Duration,Housing_free,Housing_own,Housing_rent,Saving accounts_little,Saving accounts_moderate,Saving accounts_quite rich,...,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Risk_bad,Risk_good
0,22,2,5951,48,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,45,2,7882,42,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,53,2,4870,24,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,35,3,6948,36,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,28,3,5234,30,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Creating the Features and Target Sets

In [20]:
# Creating the features set X
X = encoded_df.drop(columns=["Risk_bad", "Risk_good"])

# Display sample data
X.head()

Unnamed: 0,Age,Job,Credit amount,Duration,Housing_free,Housing_own,Housing_rent,Saving accounts_little,Saving accounts_moderate,Saving accounts_quite rich,...,Checking account_moderate,Checking account_rich,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others
0,22,2,5951,48,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,45,2,7882,42,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,53,2,4870,24,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,35,3,6948,36,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28,3,5234,30,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Creating the target set y
y = encoded_df["Risk_bad"]

# Display sample data
y.head()

0    1.0
1    0.0
2    1.0
3    0.0
4    1.0
Name: Risk_bad, dtype: float64

### Split the features and target sets into training and testing datasets

In [22]:
# Split the preprocessed data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### Use the Scikit-Learn’s StandardScaler to scale the features data

In [23]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Creating a Machine Learning Model in SageMaker Studio

### Importing the Required Libraries

In [29]:
# Import Amazon SageMaker libraries and modules
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer, json_deserializer

# Import AWS Python SDK
import boto3

# Import support libraries
import io
import os
import json
import numpy as np

### Configuring General Settings for the SageMaker Model

In [36]:
# Set the S3 bucket name
bucket = 'fintech-bootcamp-activities-jr-2023-01-04'


In [37]:
# Set a prefix for the data files
prefix = 'credit_risk'

In [38]:
# Set the IAM execution role
role = get_execution_role()

### Uploading the Training and Testing Data to Amazon S3

#### Encode and upload the training data

In [39]:
# Encode the training data as Protocol Buffer
buf = io.BytesIO()
vectors = np.array(X_train).astype("float32")
labels = np.array(y_train).astype("float32")
smac.write_numpy_to_dense_tensor(buf, vectors, labels)
buf.seek(0)

# Upload encoded training data to Amazon S3
key = 'linear_train.data'
boto3.resource("s3").Bucket(bucket).Object(os.path.join(prefix, "train", key)).upload_fileobj(buf)
s3_train_data = "s3://{}/{}/train/{}".format(bucket, prefix, key)
print("Training data uploaded to: {}".format(s3_train_data))

Training data uploaded to: s3://fintech-bootcamp-activities-jr-2023-01-04/credit_risk/train/linear_train.data


#### Encode and upload the testing data

In [40]:
# Encode the testing data as Protocol Buffer
buf = io.BytesIO()
vectors = np.array(X_test).astype("float32")
labels = np.array(y_test).astype("float32")
smac.write_numpy_to_dense_tensor(buf, vectors, labels)
buf.seek(0)

# Upload encoded testing data to Amazon S3
key = "linear_test.data"
boto3.resource("s3").Bucket(bucket).Object(os.path.join(prefix, "test", key)).upload_fileobj(buf)
s3_test_data = "s3://{}/{}/test/{}".format(bucket, prefix, key)
print("Testing data uploaded to: {}".format(s3_test_data))

Testing data uploaded to: s3://fintech-bootcamp-activities-jr-2023-01-04/credit_risk/test/linear_test.data


### Specify the Amazon SageMaker Session to Use

In [42]:
# Save the current session in a variable
sess = sagemaker.Session()

### Create an Instance of the Machine Learning Model

In [43]:
# Import the get_image_uri module from the sagemaker library
from sagemaker.amazon.amazon_estimator import get_image_uri

In [44]:
# Import the container image
container = get_image_uri(boto3.Session().region_name, 'linear-learner')

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [50]:
# Create an instance of the machine learning model
linear_learner = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count = 1,
    train_instance_type = 'ml.m4.xlarge',
    output_path = 's3://{}/{}/output'.format(bucket,prefix),
    sagemaker_session = sess
)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


### Define Linear Learner Hyperparameters

In [51]:
# Get the dimension of the feature-input vector
feature_dim = X.shape[1]

In [54]:
# Define linear learner hyperparameters
# Note how in this case we use: predictor_type='binary_classifier' # (credit risk: good or bad)
linear_learner.set_hyperparameters(
    feature_dim = feature_dim,
    mini_batch_size = 200,
    predictor_type = 'binary_classifier'
)

## Fitting a Machine Learning Model in SageMaker Studio

In [55]:
# Fitting the linear learner model
linear_learner.fit({
    'train': s3_train_data,
    'test': s3_test_data
})

2023-01-31 02:41:32 Starting - Starting the training job...
2023-01-31 02:42:00 Starting - Preparing the instances for trainingProfilerReport-1675132892: InProgress
.........
2023-01-31 02:43:17 Downloading - Downloading input data...
2023-01-31 02:43:58 Training - Downloading the training image......
2023-01-31 02:44:58 Training - Training image download completed. Training in progress...[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[01/31/2023 02:45:08 INFO 140670575187776] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_s

## Making Predictions With the Model in SageMaker Studio

### Deploying the Model

In [56]:
# Deploy an instance of the linear learner model to create a predictor
linear_predictor = linear_learner.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

--------------------!

### Setting Configurations for the Predictor

In [57]:
# Linear predictor configurations
linear_predictor.serializer = csv_serializer
linear_predictor.deserializer = json_deserializer

### Making Predictions Using Testing Data

In [58]:
# Making some predictions using the test data
model_predictions = linear_predictor.predict(X_test_scaled)

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The json_deserializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [59]:
# Display sample predictions
model_predictions['predictions'][:3]

[{'score': 0.3425270617008209, 'predicted_label': 0},
 {'score': 0.20269660651683807, 'predicted_label': 0},
 {'score': 0.006089568138122559, 'predicted_label': 0}]

In [60]:
# Create a list with the predicted values
y_predictions = [ np.uint8(i['predicted_label']) for i in model_predictions['predictions'] ]
# Transforming the list into an array
y_predictions = np.array(y_predictions)
# Display sample data
y_predictions[:10]

array([0, 0, 0, 0, 0, 1, 1, 0, 0, 0], dtype=uint8)

## Evaluating a Machine Model in SageMaker Studio

In [61]:
# Import the classification report from Scikit-learn
from sklearn.metrics import classification_report

In [62]:
# Display classification report
print("Classification report")
print(classification_report(y_test, y_predictions))

Classification report
              precision    recall  f1-score   support

         0.0       0.56      0.62      0.59        78
         1.0       0.35      0.30      0.32        53

    accuracy                           0.49       131
   macro avg       0.46      0.46      0.46       131
weighted avg       0.48      0.49      0.48       131



## Delete the End-Point to Avoid Additional AWS Resources Usage and Billing

Make sure that you delete all the Amazon SageMaker endpoints to prevent unwanted charges.

In [63]:
# Delete Amazon SageMaker end-point
sagemaker.Session().delete_endpoint(linear_predictor.endpoint)

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
