# IMDB Sentiment Classifier
### Using Hugging Face with the SageMaker SDK

# What We're Going To Do:

#### Installation
1. Install the SageMaker SDK and the Hugging Face libraries
1. Start a SageMaker session, including the default IAM role and S3 bucket
    
#### Data Preparation
1. Tokenization: Download and prepare our IMDB dataset for NLP model training
1. Upload our tokenized and split dataset to S3

#### Model Training
1. Set up an Estimator
1. Train a model

#### Model Deployment and Realtime Inference
1. Prepare the model for deployment
1. Deploy the model and create a Predictor
1. Make inferences using a Predictor

#### Clean Up

---
# Installation

In [33]:
%%capture

import os

TRANSFORMERS_VERSION = "4.5.0"
DATASETS_VERSION = "1.6.2"

requirements_txt = """numpy
pandas
transformers=={0}
datasets=={1}
""".format(TRANSFORMERS_VERSION, DATASETS_VERSION)

with open(os.path.join(os.getcwd(), "scripts", "requirements.txt"), "w") as f:
    f.write(requirements_txt)

!pip install --upgrade "sagemaker>=2.31.0" "transformers==$TRANSFORMERS_VERSION" "datasets[s3]==$DATASETS_VERSION"
!conda install -c conda-forge ipywidgets -y

# import IPython
# IPython.Application.instance().kernel.do_shutdown(True)

In [34]:
%%capture

import boto3
import botocore
import sagemaker
import sagemaker.huggingface

# IAM role handed to the training job and the deployed model further down.
role = sagemaker.get_execution_role()

# SageMaker session and the default S3 bucket that will hold the dataset.
session = sagemaker.Session()
bucket = session.default_bucket()

In [35]:
# Show which role, bucket and region this notebook is operating against.
print(
    f"SageMaker role arn: {role}",
    f"SageMaker bucket: {session.default_bucket()}",
    f"SageMaker session region: {session.boto_region_name}",
    sep="\n",
)

SageMaker role arn: arn:aws:iam::934284400219:role/service-role/AmazonSageMaker-ExecutionRole-20210510T080103
SageMaker bucket: sagemaker-us-east-1-934284400219
SageMaker session region: us-east-1


---
# Data Preparation

### Download and Split the Dataset 

In [36]:
import importlib
import pandas
import datasets
from transformers import AutoTokenizer
from sagemaker_demo_helper import SageMakerDemoHelper

# Restrict the datasets library's logging to errors only.
datasets.logging.set_verbosity_error()

# Checkpoint name; the same value is passed to the training job as the
# "model_name" hyperparameter later on.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Requesting a list of splits yields one dataset per requested split, which is
# unpacked here into the train and test sets.
imdb_splits = datasets.load_dataset(
    "imdb",
    split=["train", "test"],
    ignore_verifications=True,
)
train_dataset, test_dataset = imdb_splits

### Tokenize

In [37]:
datasets.logging.set_verbosity_error()

# Fixed seed so that Restart & Run All produces the same row order every time.
SHUFFLE_SEED = 42

# Columns exposed as torch tensors once a split has been prepared.
columns = ["input_ids", "attention_mask", "labels"]


def tokenize(batch):
    """Tokenize the "text" field of a batch of examples.

    Every review is padded to the tokenizer's maximum length and truncated
    when it is longer, so all rows end up with the same number of tokens.
    """
    return tokenizer(batch["text"], padding="max_length", truncation=True)


def prepare_split(split):
    """Shuffle, tokenize and format a single dataset split.

    Args:
        split: a dataset with "text" and "label" columns.

    Returns:
        The tokenized dataset with "label" renamed to "labels" and its output
        format set to torch tensors for the entries listed in `columns`.
    """
    # batched=True hands the tokenizer many reviews per call instead of one.
    prepared = split.shuffle(seed=SHUFFLE_SEED).map(tokenize, batched=True)

    # Rename only when the source column is present, rather than swallowing
    # every possible error with a bare except.
    # NOTE(review): "labels" is presumably the name the training script
    # expects; confirm against scripts/trainer.py.
    if "label" in prepared.column_names:
        prepared = prepared.rename_column("label", "labels")

    prepared.set_format("torch", columns=columns)
    return prepared


train_ds = prepare_split(train_dataset)
test_ds = prepare_split(test_dataset)

In [38]:
# Preview the first five tokenized training rows as a DataFrame; the values are
# torch tensors because the dataset format was set to "torch" above.
pandas.DataFrame(train_ds[0:5])

Unnamed: 0,attention_mask,input_ids,labels
0,"(tensor(1), tensor(1), tensor(1), tensor(1), t...","(tensor(101), tensor(1045), tensor(2245), tens...",0
1,"(tensor(1), tensor(1), tensor(1), tensor(1), t...","(tensor(101), tensor(1045), tensor(2253), tens...",0
2,"(tensor(1), tensor(1), tensor(1), tensor(1), t...","(tensor(101), tensor(1045), tensor(4771), tens...",0
3,"(tensor(1), tensor(1), tensor(1), tensor(1), t...","(tensor(101), tensor(10386), tensor(19558), te...",0
4,"(tensor(1), tensor(1), tensor(1), tensor(1), t...","(tensor(101), tensor(1996), tensor(2364), tens...",0


### Upload the Dataset to S3

In [39]:
from datasets.filesystems import S3FileSystem

# Filesystem object passed to save_to_disk so the datasets are written to S3.
s3 = S3FileSystem()

# Both splits live under one prefix in the session's default bucket.
s3_prefix = "datasets/imdb-binary-classification"
training_input_path = f"s3://{bucket}/{s3_prefix}/train"
test_input_path = f"s3://{bucket}/{s3_prefix}/test"

# Persist each tokenized split to its own S3 location.
for split_ds, destination in (
    (train_ds, training_input_path),
    (test_ds, test_input_path),
):
    split_ds.save_to_disk(destination, fs=s3)

---
# Model Training

### Set Up an Estimator

In [40]:
from sagemaker.huggingface import HuggingFace

job_name = "imdb-huggingface"

# Number printed after a metric key in the training log. The outer group
# captures the whole value and accepts integers ("3"), decimals ("0.25") and
# scientific notation ("4.9e-05"). The previous pattern used an unescaped "."
# and stopped at the mantissa, so "4.9e-05" was captured as "4.9".
METRIC_VALUE_PATTERN = r"([0-9]+(\.[0-9]+)?([eE][+-]?[0-9]+)?)"

# Keys to extract from the training log, in the order they are reported.
METRIC_NAMES = [
    "loss",
    "learning_rate",
    "eval_loss",
    "eval_accuracy",
    "eval_f1",
    "eval_precision",
    "eval_recall",
    "eval_runtime",
    "eval_samples_per_second",
    "epoch",
]

# One {"Name", "Regex"} entry per metric. The leading quote in the regex keeps
# 'loss' from also matching inside 'eval_loss'.
metric_definitions = [
    {"Name": metric_name, "Regex": f"'{metric_name}': {METRIC_VALUE_PATTERN},?"}
    for metric_name in METRIC_NAMES
]

# Hyperparameters handed to the training job; model_name is the same
# checkpoint the tokenizer was loaded from.
training_hyperparameters = {
    "epochs": 3,
    "eval_batch_size": 128,
    "model_name": model_name,
    "train_batch_size": 64,
}

# Estimator describing the training job: the code to run (trainer.py from
# ./scripts), the framework versions, the hardware, and the metrics to record.
estimator = HuggingFace(
    entry_point="trainer.py",
    source_dir="./scripts",
    base_job_name=job_name,
    role=role,
    transformers_version=TRANSFORMERS_VERSION,
    pytorch_version="1.6.0",
    py_version="py36",
    instance_type="ml.p3.16xlarge",
    instance_count=1,
    enable_sagemaker_metrics=True,
    metric_definitions=metric_definitions,
    hyperparameters=training_hyperparameters,
)

### Train a Model using the Estimator

In [None]:
# Channel name -> S3 location of the tokenized split uploaded earlier.
inputs = dict(train=training_input_path, test=test_input_path)

# wait=True keeps this cell running until the training job has finished.
estimator.fit(inputs, wait=True)

2021-05-11 14:25:05 Starting - Starting the training job...
2021-05-11 14:25:33 Starting - Launching requested ML instancesProfilerReport-1620743105: InProgress
............
2021-05-11 14:27:34 Starting - Preparing the instances for training.........
2021-05-11 14:29:05 Downloading - Downloading input data
2021-05-11 14:29:05 Training - Downloading the training image........

In [None]:
# Optional: uncomment after training has finished to load the metrics recorded
# for the latest training job into a DataFrame and display the maximum value
# per metric name.
# from sagemaker import TrainingJobAnalytics
# df = TrainingJobAnalytics(training_job_name = estimator.latest_training_job.name).dataframe()
# display(df[["metric_name", "value"]].groupby("metric_name").max())

---
# Model Deployment

### Prepare the Model for Deployment

In [None]:
import time
from sagemaker.utils import name_from_base
from sagemaker.pytorch import PyTorchModel
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

class SentimentAnalysis(Predictor):
    """Predictor that serializes requests as JSON and deserializes JSON responses.

    Passed to the model as `predictor_cls`, so it is the type of the object
    returned when the model is deployed.
    """

    def __init__(self, endpoint_name, sagemaker_session):
        """Bind the predictor to `endpoint_name` using `sagemaker_session`."""
        json_io = dict(
            serializer=JSONSerializer(),
            deserializer=JSONDeserializer(),
        )
        super().__init__(
            endpoint_name,
            sagemaker_session=sagemaker_session,
            **json_io,
        )

# Name derived from the training job's base name; reused below as the
# endpoint name so the model and the endpoint share one identifier.
name = name_from_base(job_name)

# Wrap the trained artifacts with the inference script (predictor.py from
# ./scripts) so they can be served; predictions go through SentimentAnalysis.
model = PyTorchModel(
    model_data=estimator.model_data,
    entry_point="predictor.py",
    source_dir="./scripts",
    framework_version="1.6.0",
    py_version="py36",
    role=role,
    name=name,
    predictor_cls=SentimentAnalysis,
)

In [None]:
# Endpoint configuration: one ml.m5.large instance, named after the model.
endpoint_settings = dict(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    endpoint_name=name,
    wait=True,
)

# Create the endpoint and keep the returned predictor for inference.
predictor = model.deploy(**endpoint_settings)

### Make Inferences Using a SageMaker Predictor

In [None]:
import json

# A handful of hand-written reviews to send to the endpoint.
sample_reviews = (
    "Willow is the greatest movie that ever lived.",
    "The Notebook is ironically depressing.",
    "It's annoying that I had to Google the capitalization of Back to the Future, but it is a gem of nostalgic wonder.",
    "Yikes! Weird Science did not age well for 2021.",
)

# Send each review on its own and print the prediction next to its text.
for it in sample_reviews:
    prediction = predictor.predict({"text": it})
    print(f'    {prediction}: {it}')

---
# Clean Up

In [None]:
from botocore.exceptions import ClientError

# Delete the endpoint and the model independently, so that a failure on one
# never skips the other and leaves a billable resource behind. Only the
# expected failures are caught, and the real error is shown instead of a
# blanket "Already deleted".

try:
    predictor.delete_endpoint()
except (NameError, ClientError) as error:
    # NameError: the deploy cell never ran in this kernel.
    # ClientError: AWS rejected the call, e.g. the endpoint no longer exists.
    print(f"Endpoint was not deleted: {error!r}")

try:
    model.delete_model()
except (NameError, ValueError, ClientError) as error:
    # NOTE(review): ValueError is presumably what the SDK raises when the
    # model was never created on SageMaker; confirm against the SDK version.
    print(f"Model was not deleted: {error!r}")