# IMDB Sentiment Classifier
### Using Hugging Face with the SageMaker SDK

---
# Uh Oh

If we're here, then something went wrong and we're using an existing real-time inference endpoint.

In [None]:
%%capture

import boto3
import botocore
import sagemaker
import sagemaker.huggingface
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Shared SageMaker context: the session, its default S3 bucket, and the
# execution role.
session = sagemaker.Session()
bucket = session.default_bucket()
role = sagemaker.get_execution_role()

# Attach to the named endpoint rather than deploying a new one; requests and
# responses are serialized as JSON.
predictor = Predictor(
    endpoint_name="imdb-huggingface-2021-05-17-18-17-17-517",
    sagemaker_session=session,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

### Make Inferences Using a SageMaker Predictor

In [None]:
import json
import pandas

# Sample movie-review sentences; each one is sent for sentiment prediction.
inputs = [
    "Willow is the greatest movie that ever lived.",
    "The Notebook is ironically depressing.",
    "It's annoying that I had to Google the capitalization of 'Back to the Future', but it is a gem of nostalgic wonder.",
    "Yikes! Weird Science did not age well for 2021.",
    "Love and Monsters made me cry happy tears."
]

# Send each review to the endpoint as {"text": ...} and merge the response
# fields into the same record, so every row carries the input text alongside
# whatever the endpoint returned.
results = [
    {"text": review, **predictor.predict({"text": review})}
    for review in inputs
]

df = pandas.DataFrame(results)
df.head()

---
# Load a Pre-Trained Model from S3

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import tarfile

model_s3_path = "s3://sagemaker-us-east-1-934284400219/imdb-huggingface-2021-05-17-18-17-17-517/model.tar.gz"

# Download the model archive from S3 into the local "models" directory.
sagemaker.s3.S3Downloader.download(model_s3_path, "models")

# Unpack the archive in place. The `with` block closes the archive on exit,
# so no explicit close() call is needed.
with tarfile.open("models/model.tar.gz") as archive:
    if hasattr(tarfile, "data_filter"):
        # Extraction filters are available: use the "data" filter so members
        # cannot be written outside "models/" (absolute paths, escaping
        # links) and special files are rejected. This also avoids the
        # DeprecationWarning raised when no filter is given.
        archive.extractall(path="models/", filter="data")
    else:
        # Older Python without extraction filters: plain extraction.
        archive.extractall(path="models/")

# The tokenizer is loaded by name rather than from the extracted directory.
# NOTE(review): presumably this matches the tokenizer used for training — confirm.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("./models")

In [None]:
def predict(input_text):
    """Classify the sentiment of a single piece of text with the local model.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        input_text: The text to classify. The result extraction below assumes
            a batch of exactly one example (a single string).

    Returns:
        A dict with three keys:
            "text": the input text, unchanged.
            "sentiment": "NEGATIVE" or "POSITIVE".
            "confidence": softmax probability of the predicted class (float).
    """
    # NOTE(review): assumes label index 0 is negative and 1 is positive —
    # confirm against the training configuration.
    CLASS_NAMES = ["NEGATIVE", "POSITIVE"]

    tokenized = tokenizer(
        input_text,
        add_special_tokens=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Inference only: skip autograd bookkeeping so no graph is built or kept.
    with torch.no_grad():
        output = model(
            input_ids=tokenized["input_ids"],
            attention_mask=tokenized["attention_mask"],
        )

    # Predicted class = largest logit; confidence = its softmax probability.
    index = torch.argmax(output.logits, dim=1).item()
    probabilities = torch.softmax(output.logits, dim=1)
    confidence = probabilities[0][index].item()

    return {
        "text": input_text,
        "sentiment": CLASS_NAMES[index],
        "confidence": confidence,
    }

# Run the local model over every sample review and preview the results table.
df = pandas.DataFrame(list(map(predict, inputs)))
df.head()