# IMDB Sentiment Classifier
### Using Hugging Face with the SageMaker SDK

# What We're Going To Do:

#### Installation
1. Install the SageMaker SDK and the Hugging Face libraries
1. Start a SageMaker session, including the default IAM role and S3 bucket
    
#### Data Preparation
1. Tokenization: Download and prepare our IMDB dataset for NLP model training
1. Upload our tokenized and split dataset to S3

#### Model Training
1. Setup an Estimator
1. Prepare the model for deployment

#### Realtime Inference
1. Deploy the model
1. Make inferences with a Predictor

#### Clean Up

## Installation

In [None]:
%%capture

!pip install --upgrade "sagemaker>=2.31.0" "transformers==4.4.2" "datasets[s3]==1.5.0"
!conda install -c conda-forge ipywidgets -y

import IPython
IPython.Application.instance().kernel.do_shutdown(True)

import sagemaker.huggingface

In [1]:
%%capture

import boto3
import sagemaker

session = sagemaker.Session()
bucket = session.default_bucket()
role = sagemaker.get_execution_role()

In [2]:
print(f"SageMaker role arn: {role}")
print(f"SageMaker bucket: {session.default_bucket()}")
print(f"SageMaker session region: {session.boto_region_name}")

SageMaker role arn: arn:aws:iam::061635907654:role/service-role/AmazonSageMaker-ExecutionRole-20201221T131849
SageMaker bucket: sagemaker-us-east-1-061635907654
SageMaker session region: us-east-1


# Data Preparation

## Download and Split the Dataset 

In [3]:
%%capture

import pandas
import datasets
from transformers import AutoTokenizer
from sagemaker_demo_helper import SageMakerDemoHelper

model_name = "distilbert-base-uncased"

helper = SageMakerDemoHelper.instance(bucket, role, model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)

train_dataset, test_dataset = datasets.load_dataset(
    "imdb", 
    ignore_verifications = True,
    split = ["train", "test"]
)

# Tokenize

In [4]:
tokenize = lambda batch: tokenizer(batch["text"], padding = "max_length", truncation = True)
# test_ds = test_dataset.shuffle().select(range(10000))

train_ds = train_dataset.shuffle().map(tokenize)
test_ds = test_dataset.shuffle().map(tokenize)

try:
    train_ds = train_ds.rename_column("label", "labels")
    test_ds = test_ds.rename_column("label", "labels")
except:
    pass

pandas.DataFrame(train_ds[0:5])

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3/cache-9849c7513e7c2228.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3/cache-2710b701f1525e3f.arrow


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))




Unnamed: 0,attention_mask,input_ids,labels,text
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2096, 1996, 2434, 4673, 2544, 1010, 2007...",1,"While the original 1932 version, with Preston ..."
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 18527, 1996, 2087, 5720, 1011, 2055, 213...",1,Ironically the most talked-about American film...
2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 27137, 2003, 1996, 2208, 1010, 14636, 20...",0,"Bingo is the game, bullshit is the name. Rarel..."
3,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2045, 2024, 2061, 2116, 4436, 2004, 2000...",1,There are so many reasons as to why I rate the...
4,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 1045, 2245, 3666, 6107, 6876, 2006, 5971...",0,I thought watching employment videos on corpor...


In [5]:
columns = ["input_ids", "attention_mask", "labels"]
train_ds.set_format("torch", columns = columns)
test_ds.set_format("torch", columns = columns)

## Uploading the Dataset to S3

In [6]:
import botocore
from datasets.filesystems import S3FileSystem

s3 = S3FileSystem()

s3_prefix = "datasets/imdb-binary-classification"

training_input_path = f"s3://{bucket}/{s3_prefix}/train"
train_ds.save_to_disk(training_input_path, fs = s3)

test_input_path = f"s3://{bucket}/{s3_prefix}/test"
test_ds.save_to_disk(test_input_path, fs = s3)

# Model Training

# Setup an Estimator

In [8]:
estimator = helper.use_standard_training()

inputs = { 
    "train": training_input_path, 
    "test": test_input_path
}

estimator.fit(
    inputs,
    wait = False
)

<sagemaker.huggingface.estimator.HuggingFace object at 0x7f9080198748>


# Prepare the Model for Deployment

In [11]:
from sagemaker import TrainingJobAnalytics

df = TrainingJobAnalytics(training_job_name = estimator.latest_training_job.name).dataframe()
display(df[["metric_name", "value"]].groupby("metric_name").max())

KeyError: 'TrainingStartTime'

In [None]:
import time
from sagemaker.utils import name_from_base
from sagemaker.pytorch import PyTorchModel
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

class SentimentAnalysis(Predictor):
    def __init__(self, endpoint_name, sagemaker_session):
        super().__init__(
            endpoint_name, 
            sagemaker_session = sagemaker_session, 
            serializer = JSONSerializer(), 
            deserializer = JSONDeserializer()
        )

name = name_from_base("imdb-huggingface")

model = PyTorchModel(
    name = name,
    role = role, 
    model_data = estimator.model_data,
    source_dir = "./scripts",
    entry_point = "torchserve-predictor.py",
    framework_version = "1.6.0",
    py_version = "py36",
    predictor_cls = SentimentAnalysis
)


# Deploy the Model

In [None]:
predictor = model.deploy(
    initial_instance_count = 1,
    instance_type = "ml.m5.large",
    endpoint_name = name,
    wait = True
)

# Make Inferences
## Using a SageMaker Predictor

In [None]:
import json

inputs = [
    "Willow is the greatest movie that has ever lived.",
    "The Notebook is ironically depressing.",
    "My cat's breath smells like cat food."
]

prediction = predictor.predict({"text": it})
print(f'    {prediction}: {it}')

# Clean Up

In [None]:
try:
    for i in range(len(models)):
        predictors[i].delete_endpoint()
        models[i].delete_model()
except:
    display("Already deleted")