# Batch Transform

In [1]:
!pip install -U pandas scikit-learn sagemaker

You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

## Create the inference script
- As with real-time inference, we first write the four inference handler functions (`model_fn`, `input_fn`, `predict_fn`, `output_fn`) in a .py script
- SageMaker Python SDK API documentation: https://sagemaker.readthedocs.io/en/stable/api/index.html

In [2]:
%%writefile batch_transform.py

import os
import joblib
import pandas as pd

# Load the Model
def model_fn(model_dir):
    """Deserialize the fitted pipeline stored in ``model_dir``.

    Args:
        model_dir: Directory that contains ``pipeline_model.joblib``.

    Returns:
        Whatever object ``joblib.load`` reconstructs from that file.

    Note:
        ``joblib.load`` unpickles the file, so only point this at trusted
        model artifacts.
    """
    artifact_path = os.path.join(model_dir, "pipeline_model.joblib")
    return joblib.load(artifact_path)

# Load the input data
def input_fn(request_body, request_content_type):
    """Parse a JSON Lines request payload into a pandas DataFrame.

    Args:
        request_body: The payload, one JSON object per line. Accepted as
            ``str`` or as UTF-8 encoded ``bytes``/``bytearray``.
        request_content_type: MIME type of the payload; only
            ``"application/json"`` is accepted.

    Returns:
        pandas.DataFrame: One row per JSON line in the payload.

    Raises:
        ValueError: If ``request_content_type`` is not ``"application/json"``.
    """
    # Imported locally so this function does not depend on edits to the
    # script's import block.
    from io import StringIO

    if request_content_type != "application/json":
        raise ValueError("Only application/json content type supported!")

    # Be tolerant of callers that hand over the raw (undecoded) request body.
    if isinstance(request_body, (bytes, bytearray)):
        request_body = request_body.decode("utf-8")

    # Wrap the text in a file-like buffer: passing a literal JSON string
    # straight to pd.read_json is deprecated in recent pandas releases.
    return pd.read_json(StringIO(request_body), lines=True)

def predict_fn(input_object, pipeline_model):
    """Score the parsed input with the fitted pipeline.

    Args:
        input_object: Feature data to score; passed unchanged to the model.
        pipeline_model: Fitted estimator exposing ``predict`` and
            ``predict_proba``.

    Returns:
        pandas.DataFrame: Columns ``prediction``, ``pred_prob_class0`` and
        ``pred_prob_class1``, one row per scored record.
    """
    labels = pipeline_model.predict(input_object)
    probabilities = pipeline_model.predict_proba(input_object)

    # Only the first two probability columns are reported.
    label_values = labels.tolist()
    class0_values = probabilities[:, 0].tolist()
    class1_values = probabilities[:, 1].tolist()

    columns = {
        "prediction": label_values,
        "pred_prob_class0": class0_values,
        "pred_prob_class1": class1_values,
    }
    return pd.DataFrame(columns)

def output_fn(prediction_object, request_content_type):
    """Serialize the prediction frame as JSON Lines text.

    Args:
        prediction_object: DataFrame of predictions to serialize.
        request_content_type: Not consulted; the result is always rendered
            as JSON records, one per line.

    Returns:
        str: One JSON object per DataFrame row, newline-delimited.
    """
    return prediction_object.to_json(orient="records", lines=True)

Overwriting batch_transform.py


In [3]:
%%writefile requirements.txt
# Packages for the inference script; this file is passed to SKLearnModel via `dependencies`.
pandas
numpy

Overwriting requirements.txt


## Trigger Batch Transform Job

In [4]:
# Wrap the trained pipeline artifact in a SageMaker SKLearnModel.
# This cell only builds the model object; the batch transformer is derived from it later.
from sagemaker import Session, get_execution_role
from sagemaker.sklearn.model import SKLearnModel

# The model artifact lives under the training job's output prefix in S3.
training_job_name = "knn-pipeline-tuner-220611-0916-006-77833ec6"
model_artifact = f"s3://sagemaker-us-east-1-298138509966/{training_job_name}/output/model.tar.gz"

# NOTE: despite its name, this value is used as the SageMaker *model* name below.
endpoint_name = "heart-disease-knn-pipeline-model"

base_model = SKLearnModel(
    model_data=model_artifact,
    role=get_execution_role(),
    # Inference handlers and their extra requirements written by the cells above.
    entry_point="batch_transform.py",
    dependencies=["requirements.txt"],
    framework_version="1.0-1",
    name=endpoint_name,
    sagemaker_session=Session(),
)

In [12]:
# NEW! Derive a batch transformer from the base model.
# Predictions are written beneath this S3 prefix.
output_path = "s3://sagemaker-us-east-1-298138509966/sagemaker/heart_disease/test_preds"

batch_transformer = base_model.transformer(
    instance_count=2,
    instance_type="ml.m5.large",
    # Several records per request; results are stitched back together line by line.
    strategy="MultiRecord",
    assemble_with="Line",
    accept="application/json",
    output_path=output_path,
)

Using already existing model: heart-disease-knn-pipeline-model


In [13]:
%%time
# Feed the test data
test_data_path = "s3://sagemaker-us-east-1-298138509966/sagemaker/heart_disease/bigtest.json"
batch_transformer.transform(test_data_path, content_type="application/json", split_type="Line")

.............................[34m2022-06-12 06:02:17,615 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2022-06-12 06:02:17,618 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2022-06-12 06:02:17,618 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
 

In [7]:
# Report the S3 object that holds the transform job's predictions.
# Note: this rebinds `output_path` from the output prefix to the output file itself.
output_path = f"{batch_transformer.output_path}/bigtest.json.out"
print("Output written to: ", output_path, sep="\n")

Output written to: 
s3://sagemaker-us-east-1-298138509966/sagemaker/heart_disease/test_preds/bigtest.json.out


## Analyse the predictions

In [14]:
import pandas as pd

# Load the newline-delimited JSON predictions produced by the transform job.
output_path = "s3://sagemaker-us-east-1-298138509966/sagemaker/heart_disease/test_preds/bigtest.json.out"
preds_df = pd.read_json(path_or_buf=output_path, lines=True)

# Show the frame's dimensions, then preview the first rows.
print(preds_df.shape)
preds_df.head()

(500000, 3)


Unnamed: 0,prediction,pred_prob_class0,pred_prob_class1
0,1,0.333333,0.666667
1,1,0.333333,0.666667
2,0,1.0,0.0
3,1,0.333333,0.666667
4,0,1.0,0.0


In [9]:
# Join predictions to input
bigtest = "../data/bigtest.json"
bigtest_input_df = pd.read_json(bigtest, lines=True)

# DataFrame.join aligns rows on the index, so both frames must describe the
# same records in the same order. Without this check, a row-count mismatch
# would only surface as silent NaN predictions or silently dropped rows.
assert len(bigtest_input_df) == len(preds_df), (
    f"row count mismatch: {len(bigtest_input_df)} input rows vs "
    f"{len(preds_df)} prediction rows"
)

# Keep the raw input under its own name so this cell stays idempotent.
bigtest_df = bigtest_input_df.join(preds_df)
bigtest_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,prediction,pred_prob_class0,pred_prob_class1
0,51,1,3,110,175,0,0,123,0,0.6,1,0,3,0,1,0.333333,0.666667
1,53,1,4,140,203,1,2,155,1,3.1,3,0,7,1,1,0.333333,0.666667
2,35,1,2,122,192,0,0,174,0,0.0,1,0,3,0,0,1.0,0.0
3,53,1,4,140,203,1,2,155,1,3.1,3,0,7,1,1,0.333333,0.666667
4,41,1,2,110,235,0,0,153,0,0.0,1,0,3,0,0,1.0,0.0


In [15]:
# Test accuracy: share of rows whose predicted label equals the true target.
is_correct = bigtest_df["target"] == bigtest_df["prediction"]
int(is_correct.sum()) / len(bigtest_df)

0.867096