# Predicting whether an order should be sent to a technical approver

For updates on how SageMaker or AWS behavior has changed since this notebook's code was written, please refer to https://livebook.manning.com/#!/book/machine-learning-for-business/chapter-2/v-5/67

2022/2: Updated notebook to SageMaker Python SDK v2 (author: Su Wei)


## Part 1: Load and examine the data

In [2]:
# Notebook-wide settings: local dataset file name and the S3 key prefix.
dataset = 'orders_with_predicted_value.csv'
subfolder = 'ch02'
# Placeholder only; overwritten with the SageMaker default bucket in the setup cell below.
data_bucket = ''

In [None]:
import pandas as pd
from time import sleep

import boto3
import sagemaker
from sklearn.model_selection import train_test_split

# Create the SageMaker session first, then look up the execution role and
# the session's default S3 bucket (this replaces the empty placeholder).
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
data_bucket = sess.default_bucket()

print(f'SageMaker exectuion role: {role},  default s3 bucket: {data_bucket}')

In [6]:
# Read the orders file from the notebook's working directory and preview it.
df = pd.read_csv(f'./{dataset}')
df.head(n=5)

Unnamed: 0,tech_approval_required,requester_id,role,product,quantity,price,total
0,0,E2300,tech,Desk,1,664,664
1,0,E2300,tech,Keyboard,9,649,5841
2,0,E2374,non-tech,Keyboard,1,821,821
3,1,E2374,non-tech,Desktop Computer,24,655,15720
4,0,E2327,non-tech,Desk,1,758,758


In [7]:
# The first column holds the label; show dataset size and class balance.
label_column = df.columns[0]
print(f'Number of rows in dataset: {df.shape[0]}')
print(df[label_column].value_counts())

Number of rows in dataset: 1000
0    807
1    193
Name: tech_approval_required, dtype: int64


## Part 2: Get the data into the right shape

In [8]:
# One-hot encode the text columns (requester_id, role, product) so every
# feature is numeric; numeric columns pass through unchanged.
encoded_data = pd.get_dummies(data=df)
encoded_data.head(n=5)

Unnamed: 0,tech_approval_required,quantity,price,total,requester_id_E2300,requester_id_E2301,requester_id_E2302,requester_id_E2303,requester_id_E2304,requester_id_E2306,...,requester_id_E2400,role_non-tech,role_tech,product_Chair,product_Cleaning,product_Desk,product_Desktop Computer,product_Keyboard,product_Laptop Computer,product_Mouse
0,0,1,664,664,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,0,9,649,5841,1,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,0,1,821,821,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
3,1,24,655,15720,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
4,0,1,758,758,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [9]:
# Absolute correlation of every encoded column with the label.
abs_corr = encoded_data.corr()['tech_approval_required'].abs()

# Keep only columns whose absolute correlation exceeds 0.1
# (the label itself correlates at 1.0, so it is always kept).
columns = abs_corr[abs_corr > .1].index
corrs = abs_corr.filter(items=columns)
corrs

tech_approval_required      1.000000
role_non-tech               0.122454
role_tech                   0.122454
product_Chair               0.134168
product_Cleaning            0.191539
product_Desk                0.292137
product_Desktop Computer    0.752144
product_Keyboard            0.242224
product_Laptop Computer     0.516693
product_Mouse               0.190708
Name: tech_approval_required, dtype: float64

In [10]:
# Restrict the encoded frame to the label plus the columns selected above.
encoded_data = encoded_data.loc[:, columns]
encoded_data.head(n=5)

Unnamed: 0,tech_approval_required,role_non-tech,role_tech,product_Chair,product_Cleaning,product_Desk,product_Desktop Computer,product_Keyboard,product_Laptop Computer,product_Mouse
0,0,0,1,0,0,1,0,0,0,0
1,0,0,1,0,0,0,0,1,0,0
2,0,1,0,0,0,0,0,1,0,0
3,1,1,0,0,0,0,1,0,0,0
4,0,1,0,0,0,1,0,0,0,0


## Part 3: Create training, validation and test data sets

In [11]:
# The training log shows the XGBoost container reading CSV with label_column=0,
# so the label must be the first column of the header-less train/val files.
assert encoded_data.columns[0] == 'tech_approval_required', 'label must be the first column'

# Split into roughly 70% train, 20% validation and 10% test:
# 30% is held out first, then about one third of that hold-out becomes the test set.
train_df, val_and_test_data = train_test_split(encoded_data, test_size=0.3, random_state=0)
val_df, test_df = train_test_split(val_and_test_data, test_size=0.333, random_state=0)

# Build each S3 URI once so the upload and the TrainingInput cannot drift apart.
processed_prefix = f's3://{data_bucket}/{subfolder}/processed'
train_uri = f'{processed_prefix}/train.csv'
val_uri = f'{processed_prefix}/val.csv'
test_uri = f'{processed_prefix}/test.csv'

# to_csv() returns None when given a path, so the results are not assigned.
# Train/validation files are written without a header; the test file keeps its
# header because it is read back with pandas later in the notebook.
train_df.to_csv(train_uri, header=False, index=False)
val_df.to_csv(val_uri, header=False, index=False)
test_df.to_csv(test_uri, header=True, index=False)

# Wrap the uploaded files in TrainingInput objects for estimator.fit().
train_input = sagemaker.inputs.TrainingInput(s3_data=train_uri, content_type='csv')
val_input = sagemaker.inputs.TrainingInput(s3_data=val_uri, content_type='csv')

## Part 4: Train the model

In [13]:
# SageMaker Python SDK v1 and v2 differ; see https://sagemaker.readthedocs.io/en/stable/v2.html
sess = sagemaker.Session()

# Resolve the XGBoost image for the region this session runs in rather than a
# hard-coded 'us-east-1', so the notebook also works in other regions.
# NOTE(review): version='latest' is kept as in the original; consider pinning
# an explicit XGBoost version for reproducibility -- confirm against the SDK docs.
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=sess.boto_region_name,
    version='latest',
)

# Estimator: one ml.m5.xlarge training instance, model artifacts written under
# s3://<bucket>/<subfolder>/output.
estimator = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=f's3://{data_bucket}/{subfolder}/output',
    sagemaker_session=sess,
)

# Binary classifier evaluated by AUC on the validation channel; training stops
# early if the metric has not improved for 10 rounds (at most 100 rounds).
estimator.set_hyperparameters(
    max_depth=5,
    subsample=0.7,
    objective='binary:logistic',
    eval_metric='auc',
    num_round=100,
    early_stopping_rounds=10,
)

# Launch the training job with the train and validation channels.
estimator.fit({'train': train_input, 'validation': val_input})

2022-02-08 02:10:03 Starting - Starting the training job...
2022-02-08 02:10:28 Starting - Launching requested ML instancesProfilerReport-1644286202: InProgress
......
2022-02-08 02:11:29 Starting - Preparing the instances for training......
2022-02-08 02:12:35 Downloading - Downloading input data
2022-02-08 02:12:35 Training - Downloading the training image...
2022-02-08 02:13:01 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2022-02-08:02:12:56:INFO] Running standalone xgboost training.[0m
[34m[2022-02-08:02:12:56:INFO] File size need to be processed in the node: 0.02mb. Available memory size in the node: 8010.27mb[0m
[34m[2022-02-08:02:12:56:INFO] Determined delimiter of CSV input is ','[0m
[34m[02:12:56] S3DistributionType set as FullyReplicated[0m
[34m[02:12:56] 700x9 matrix with 6300 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2022-02-08:02:12:56:INFO] Determined delimiter of CSV input is ','

## Part 5: Host the model

In [14]:
# Best-effort cleanup: if an endpoint with this name already exists, delete it
# so the deploy cell can create a fresh one under the same name.
endpoint_name = 'order-approval'
try:
    predictor = sagemaker.predictor.Predictor(endpoint_name=endpoint_name)
    # Used as an existence probe -- presumably raises when the endpoint does
    # not exist (TODO confirm against the SDK documentation).
    predictor.endpoint_context()
    sess.delete_endpoint(endpoint_name)
    print('Warning: Existing endpoint deleted to make way for your new endpoint.')
    print('Waiting 10 seconds ...')
    sleep(10)
except Exception as err:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt/SystemExit still propagate, and report the actual
    # cause instead of always claiming the endpoint is missing.
    print(f'Endpoint {endpoint_name} was not deleted (it may not exist): {err}')



Waiting 10 seconds ...


In [15]:
#https://stackoverflow.com/questions/63568274/how-to-use-serializer-and-deserializer-in-sagemaker-2
#you need init JSONDeserializer, CSVSerializer, etc. CSVSerializer(),JSONDeserializer()

from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import CSVSerializer


# Host the trained model on a single ml.m5.large instance. Requests are
# serialized as CSV and responses are parsed as JSON (SDK v2 expects
# serializer/deserializer instances, not classes).
deploy_options = dict(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    endpoint_name=endpoint_name,
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer(),
)
predictor = estimator.deploy(**deploy_options)

-----!

## Part 6: Test the model

In [19]:
def get_prediction(row):
    """Return the rounded endpoint prediction for one row of the test set.

    Args:
        row: a pandas Series whose first element is skipped and whose
            remaining elements are sent to the endpoint as features. In this
            notebook the first element is the `tech_approval_required` label.

    Returns:
        int: the endpoint's response converted to float and rounded to the
        nearest integer (presumably 0 or 1 for a binary:logistic score --
        confirm against the endpoint's output).
    """
    # Send a plain list instead of a label-indexed Series so serialization
    # does not depend on how the serializer indexes pandas objects.
    features = row.iloc[1:].tolist()
    response = predictor.predict(features, initial_args={'ContentType': 'text/csv'})
    return round(float(response))

# Read the held-out test file back from S3 (it was written with a header row).
test_csv_uri = f's3://{data_bucket}/{subfolder}/processed/test.csv'
test_data = pd.read_csv(test_csv_uri)
cols = test_data.columns.tolist()

# Call the endpoint once per row, then move the prediction to the first
# column so it sits next to the actual label.
test_data['prediction'] = test_data.apply(get_prediction, axis=1)
test_data = test_data[['prediction', *cols]]
test_data[:10]

Unnamed: 0,prediction,tech_approval_required,role_non-tech,role_tech,product_Chair,product_Cleaning,product_Desk,product_Desktop Computer,product_Keyboard,product_Laptop Computer,product_Mouse
0,0,0,1,0,0,0,1,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,1
2,0,0,0,1,0,0,1,0,0,0,0
3,0,0,1,0,0,0,1,0,0,0,0
4,0,0,1,0,0,0,1,0,0,0,0
5,0,0,1,0,0,1,0,0,0,0,0
6,0,0,1,0,0,0,1,0,0,0,0
7,0,0,1,0,0,0,1,0,0,0,0
8,0,0,1,0,0,1,0,0,0,0,0
9,0,0,1,0,0,0,0,0,0,0,1


In [20]:
# Accuracy: fraction of test rows where the prediction equals the actual label.
test_data['prediction'].eq(test_data['tech_approval_required']).mean()

0.99

## Remove the Endpoint (optional)
Run the cell below to delete the endpoint. Comment the cell out if you want the endpoint to keep running after "Run All" (a running endpoint continues to incur charges until it is deleted).

In [21]:
# Tear down the hosted endpoint created in Part 5.
sess.delete_endpoint(endpoint_name=endpoint_name)