# IMDB Sentiment Classifier
### Using Hugging Face with the SageMaker SDK

# What We're Going To Do:

#### Installation
1. Install the SageMaker SDK and the Hugging Face libraries
1. Start a SageMaker session, including the default IAM role and S3 bucket
    
#### Data Preparation
1. Tokenization: Download and prepare our IMDB dataset for NLP model training
1. Upload our tokenized and split dataset to S3

#### Model Training
1. Set up an Estimator
1. Prepare the model for deployment

#### Realtime Inference
1. Deploy the model
1. Make inferences with a Predictor

#### Clean Up

## Installation

In [None]:
%%capture

!pip install --upgrade "sagemaker>=2.31.0" "transformers==4.4.2" "datasets[s3]==1.5.0"
!conda install -c conda-forge ipywidgets -y

import IPython
IPython.Application.instance().kernel.do_shutdown(True)

import sagemaker.huggingface

In [1]:
%%capture

import boto3
import botocore
import sagemaker

# IAM role that SageMaker assumes for the jobs and endpoints created below.
role = sagemaker.get_execution_role()

# Session wrapper plus the default bucket associated with it.
session = sagemaker.Session()
bucket = session.default_bucket()

In [2]:
# Summarize the session settings resolved in the previous cell. `bucket` is reused
# here instead of asking the session for its default bucket a second time.
print(f"SageMaker role arn: {role}")
print(f"SageMaker bucket: {bucket}")
print(f"SageMaker session region: {session.boto_region_name}")

SageMaker role arn: arn:aws:iam::061635907654:role/service-role/AmazonSageMaker-ExecutionRole-20201221T131849
SageMaker bucket: sagemaker-us-east-1-061635907654
SageMaker session region: us-east-1


# Data Preparation

## Download and Split the Dataset 

In [3]:
import importlib
import pandas
import datasets
from transformers import AutoTokenizer
from sagemaker_demo_helper import SageMakerDemoHelper

# Lower the datasets log level to errors while the dataset is loaded.
datasets.logging.set_verbosity_error()

# Checkpoint name shared by the demo helper and the tokenizer.
model_name = "distilbert-base-uncased"
helper = SageMakerDemoHelper.instance(bucket, role, model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Request both splits in one call; they come back in the order listed in `split`.
imdb_splits = datasets.load_dataset(
    "imdb",
    split=["train", "test"],
    ignore_verifications=True,
)
train_dataset, test_dataset = imdb_splits

# Tokenize

In [4]:
datasets.logging.set_verbosity_info()


def tokenize(batch):
    """Run the tokenizer over the ``text`` field of ``batch``.

    Sequences are padded to the tokenizer's maximum length and truncated when
    they exceed it.
    """
    return tokenizer(batch["text"], padding = "max_length", truncation = True)


def with_labels_column(dataset):
    """Return ``dataset`` with its ``label`` column renamed to ``labels``.

    A dataset that has no ``label`` column is returned unchanged. Any other
    failure is allowed to propagate instead of being silently swallowed, so a
    rename problem is reported where it happens rather than later in
    ``set_format``.
    """
    if "label" in dataset.column_names:
        return dataset.rename_column("label", "labels")
    return dataset


train_ds = with_labels_column(train_dataset.shuffle().map(tokenize))
test_ds = with_labels_column(test_dataset.shuffle().map(tokenize))

# Expose only these columns, formatted as torch tensors.
columns = ["input_ids", "attention_mask", "labels"]
train_ds.set_format("torch", columns = columns)
test_ds.set_format("torch", columns = columns)

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3/cache-9849c7513e7c2228.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3/cache-2710b701f1525e3f.arrow


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))




In [5]:
# Show the first five tokenized training rows as a table.
preview_rows = train_ds[0:5]
pandas.DataFrame(preview_rows)

100%|██████████| 3/3 [00:00<00:00, 5696.20it/s]


Unnamed: 0,attention_mask,input_ids,labels
0,"(tensor(1), tensor(1), tensor(1), tensor(1), t...","(tensor(101), tensor(2096), tensor(1996), tens...",1
1,"(tensor(1), tensor(1), tensor(1), tensor(1), t...","(tensor(101), tensor(18527), tensor(1996), ten...",1
2,"(tensor(1), tensor(1), tensor(1), tensor(1), t...","(tensor(101), tensor(27137), tensor(2003), ten...",0
3,"(tensor(1), tensor(1), tensor(1), tensor(1), t...","(tensor(101), tensor(2045), tensor(2024), tens...",1
4,"(tensor(1), tensor(1), tensor(1), tensor(1), t...","(tensor(101), tensor(1045), tensor(2245), tens...",0


## Uploading the Dataset to S3

In [6]:
from datasets.filesystems import S3FileSystem

# Destination locations for the two splits inside the session bucket.
s3_prefix = "datasets/imdb-binary-classification"
training_input_path = f"s3://{bucket}/{s3_prefix}/train"
test_input_path = f"s3://{bucket}/{s3_prefix}/test"

# Write each split to its S3 location through the datasets S3 filesystem.
s3 = S3FileSystem()
for split_ds, destination in (
    (train_ds, training_input_path),
    (test_ds, test_input_path),
):
    split_ds.save_to_disk(destination, fs = s3)

# Model Training

# Set Up an Estimator

In [None]:
# Channel name -> S3 location of the dataset uploaded earlier.
inputs = {"train": training_input_path, "test": test_input_path}

# Estimator comes from the demo helper; fit() is called with wait = True,
# so this cell blocks while the training job runs.
estimator = helper.use_spot_distributed()
estimator.fit(inputs, wait = True)

2021-05-07 16:44:07 Starting - Starting the training job...
2021-05-07 16:44:34 Starting - Launching requested ML instancesProfilerReport-1620405846: InProgress
............
2021-05-07 16:46:35 Starting - Preparing the instances for training.........
2021-05-07 16:48:08 Downloading - Downloading input data...
2021-05-07 16:48:35 Training - Downloading the training image..............[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-05-07 16:50:54,872 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-05-07 16:50:54,951 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m

2021-05-07 16:50:56 Training - Training image download completed. Training in progress.[35mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[35mbash: no job control in this shell[0m
[35m2021-05-07 16:5

# Prepare the Model for Deployment

In [None]:
from sagemaker import TrainingJobAnalytics

# Fetch the metrics recorded for the most recent training job.
latest_job_name = estimator.latest_training_job.name
df = TrainingJobAnalytics(training_job_name = latest_job_name).dataframe()

# Largest value recorded for each metric name.
metric_maxima = df[["metric_name", "value"]].groupby("metric_name").max()
display(metric_maxima)

In [38]:
import time
from sagemaker.utils import name_from_base
from sagemaker.pytorch import PyTorchModel
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

class SentimentAnalysis(Predictor):
    """Predictor that sends requests as JSON and decodes JSON responses.

    Args:
        endpoint_name: Name of the endpoint to invoke.
        sagemaker_session: Session forwarded to ``Predictor``. Defaults to
            ``None``, which is passed through unchanged so the base class can
            apply its own default.
    """

    def __init__(self, endpoint_name, sagemaker_session = None):
        super().__init__(
            endpoint_name,
            sagemaker_session = sagemaker_session,
            serializer = JSONSerializer(),
            deserializer = JSONDeserializer()
        )

# Name generated from the base string; the deploy cell also passes it as the
# endpoint name.
name = name_from_base("imdb-huggingface")

# Wrap the trained artifacts in a PyTorch serving model whose predictors are
# SentimentAnalysis instances.
model = PyTorchModel(
    model_data = estimator.model_data,
    role = role,
    name = name,
    entry_point = "torchserve-predictor.py",
    source_dir = "./scripts",
    py_version = "py36",
    framework_version = "1.6.0",
    predictor_cls = SentimentAnalysis
)


# Deploy the Model

In [39]:
# Deploy to a single ml.m5.large instance under the same name as the model;
# wait = True is passed, so the call returns once deployment has finished.
predictor = model.deploy(
    endpoint_name = name,
    instance_type = "ml.m5.large",
    initial_instance_count = 1,
    wait = True
)

---------------!

# Make Inferences
## Using a SageMaker Predictor

In [50]:
import json

# Sample reviews to classify. They get their own name so the `inputs` channel
# mapping passed to estimator.fit() is not rebound to a list of strings.
reviews = [
    "Willow is the greatest movie that ever lived.",
    "The Notebook is ironically depressing.",
    "It's annoying that I had to Google the capitalization of Back to the Future, but it is a gem of nostalgic wonder.",
    "Yikes! Weird Science did not age well for 2021."
]

# One request per review; print the returned prediction next to the review text.
for review in reviews:
    prediction = predictor.predict({"text": review})
    print(f'    {prediction}: {review}')

    POSITIVE: Willow is the greatest movie that ever lived.
    NEGATIVE: The Notebook is ironically depressing.
    POSITIVE: It's annoying that I had to Google the capitalization of Back to the Future, but it is a gem of nostalgic wonder.
    NEGATIVE: Yikes! Weird Science did not age well for 2021.


# Clean Up

In [51]:
from botocore.exceptions import ClientError

# Delete the endpoint and the model independently, so a failure on one (for
# example because it is already gone) does not prevent the other from being
# removed. Only AWS client errors are reported here; anything else propagates
# so that real problems are not mislabeled as "already deleted".
for resource, delete in (
    ("endpoint", predictor.delete_endpoint),
    ("model", model.delete_model),
):
    try:
        delete()
    except ClientError as error:
        display(f"Could not delete the {resource} (it may already be deleted): {error}")