## Setup

In [None]:
# Install the libraries used by this notebook into the user site-packages (--user); -q keeps pip quiet.
# NOTE(review): no versions are pinned here, so a fresh run may pick up newer releases — consider pinning.
%pip -q install torch boto3 sagemaker transformers datasets[s3] --user

In [None]:
import sagemaker

# Show the installed SageMaker SDK version so runs can be compared across environments.
print(sagemaker.__version__)

# Session and execution role are reused by later cells
# (default bucket lookup, estimator construction, model creation).
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

## Preprocessing

In [None]:
from datasets import load_dataset

# Fetch the IMDB 'train' and 'test' splits with a single call and unpack them
# in the same order as they were requested.
train_dataset, test_dataset = load_dataset('imdb', split=['train', 'test'])

# Report the shape of each split, one per line.
print(train_dataset.shape, test_dataset.shape, sep='\n')

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize(batch):
    """Tokenize the 'text' entries of a batch.

    The tokenizer is called with truncation enabled and padding='max_length'.
    Returns whatever the tokenizer returns for the batch of texts.
    """
    texts = batch['text']
    return tokenizer(texts, truncation=True, padding='max_length')


# batch_size equals the dataset length, so each split is tokenized as one batch.
train_dataset = train_dataset.map(
    tokenize,
    batched=True,
    batch_size=len(train_dataset),
)
test_dataset = test_dataset.map(
    tokenize,
    batched=True,
    batch_size=len(test_dataset),
)

In [None]:
# Rename the target column from 'label' to 'labels'
# (presumably the name train.py expects — confirm against the training script).
# The guard keeps this cell safe to re-run: without it, a second execution
# fails because the 'label' column no longer exists after the first rename.
if 'label' in train_dataset.column_names:
    train_dataset = train_dataset.rename_column('label', 'labels')
if 'label' in test_dataset.column_names:
    test_dataset = test_dataset.rename_column('label', 'labels')

## Upload data to S3

In [None]:
from datasets.filesystems import S3FileSystem

# Destination: the session's default bucket, under a fixed demo prefix.
bucket = sess.default_bucket()
prefix = 'hugging-face/demo'

training_input_path = f's3://{bucket}/{prefix}/train'
test_input_path = f's3://{bucket}/{prefix}/test'

# Write both tokenized splits straight to S3 through the datasets S3 filesystem.
s3 = S3FileSystem()
train_dataset.save_to_disk(training_input_path, fs=s3)
test_dataset.save_to_disk(test_input_path, fs=s3)

In [None]:
# Echo the two S3 locations (train first, then test), one per line.
print(training_input_path, test_input_path, sep='\n')

## Fine-tune the Hugging Face model on SageMaker

In [None]:
# Hyperparameters passed to the training script through the estimator.
hyperparameters = dict(
    epochs=1,
    train_batch_size=32,
    model_name='distilbert-base-uncased',
)

In [None]:
# Framework versions shared by the training estimator and the inference image lookup.
transformers_version = '4.12.3'
pytorch_version = '1.9.1'
py_version = 'py38'

In [None]:
import sagemaker.huggingface
from sagemaker.huggingface import HuggingFace

huggingface_estimator = HuggingFace(
    # Fine-tuning script and the hyperparameters handed to it
    entry_point='train.py',
    hyperparameters=hyperparameters,
    role=role,
    # Framework versions for the training container
    transformers_version=transformers_version,
    pytorch_version=pytorch_version,
    py_version=py_version,
    # Infrastructure: a single ml.p3.2xlarge instance
    instance_count=1,
    instance_type='ml.p3.2xlarge',
    # Managed Spot Training; max_run and max_wait are set to the same value
    use_spot_instances=True,
    max_run=3600,
    max_wait=3600,
    # Disable profiling
    disable_profiler=True,
)

In [None]:
# Launch the training job with one input channel per uploaded split.
huggingface_estimator.fit(
    inputs={
        'train': training_input_path,
        'test': test_input_path,
    },
)

In [None]:
# Keep the model artifact location reported by the estimator;
# it is passed as 'ModelDataUrl' when the model is created further down.
model_data_url = huggingface_estimator.model_data

### Use boto3 to deploy with serverless inference

https://docs.aws.amazon.com/sagemaker/latest/dg/serverless-endpoints.html

In [None]:
import boto3

# Control-plane client (create/delete models, endpoint configs, endpoints).
sm = boto3.client('sagemaker')
# Runtime client used to invoke the deployed endpoint.
sm_rt = boto3.client('sagemaker-runtime')

In [None]:
from time import gmtime, strftime

def name_with_timestamp(name):
    """Return `name` followed by '-' and the current UTC time.

    The timestamp is formatted as YYYY-MM-DD-HH-MM-SS (second resolution).
    """
    stamp = strftime('%Y-%m-%d-%H-%M-%S', gmtime())
    return f'{name}-{stamp}'

In [None]:
# Timestamped names for the three resources created below:
# model, endpoint configuration, and endpoint.
huggingface_model_name = name_with_timestamp('huggingface-serverless')
huggingface_epc_name = name_with_timestamp('huggingface-serverless-epc')
huggingface_endpoint_name = name_with_timestamp('huggingface-serverless-ep')

### Create model

In [None]:
region = boto3.session.Session().region_name

# Look up the Hugging Face inference image matching the framework versions
# that were used for training.
image_uri = sagemaker.image_uris.retrieve(
    framework='huggingface',
    version=transformers_version,
    base_framework_version=f'pytorch{pytorch_version}',
    py_version=py_version,
    image_scope='inference',
    instance_type='ml.m5.large',   # No GPU support on serverless inference
    region=region,
)

image_uri

In [None]:
# Register a model that pairs the inference image with the trained artifact.
create_model_response = sm.create_model(
    ModelName=huggingface_model_name,
    ExecutionRoleArn=role,
    Containers=[{
        'Image': image_uri,
        'Mode': 'SingleModel',
        'ModelDataUrl': model_data_url,
    }],
)

create_model_response['ModelArn']

### Create endpoint configuration

In [None]:
# A single production variant carrying a ServerlessConfig
# (memory size and maximum concurrency).
endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=huggingface_epc_name,
    ProductionVariants=[{
        'VariantName': 'single-variant',
        'ModelName': huggingface_model_name,
        'ServerlessConfig': {'MemorySizeInMB': 6144, 'MaxConcurrency': 8},
    }],
)

endpoint_config_response['EndpointConfigArn']

### Create endpoint

In [None]:
# Create the endpoint from the endpoint configuration defined above.
create_endpoint_response = sm.create_endpoint(
    EndpointConfigName=huggingface_epc_name,
    EndpointName=huggingface_endpoint_name,
)

create_endpoint_response['EndpointArn']

In [None]:
# Block until the 'endpoint_in_service' waiter reports success for the new endpoint,
# so the invocation cells below do not run against an endpoint that is still being created.
waiter = sm.get_waiter('endpoint_in_service')
waiter.wait(EndpointName=huggingface_endpoint_name)

### Invoke endpoint

In [None]:
import boto3, threading, time, json

# Re-creates the runtime client (same call as in the deployment section) —
# presumably so this section can be run on its own against an existing endpoint; confirm before removing.
sm_rt = boto3.client(service_name='sagemaker-runtime')

In [None]:
# To reuse an existing model and endpoint, uncomment the lines below and set your own values.

#model_data_url = 's3://sagemaker-us-west-2-754289655784/huggingface-pytorch-training-2021-12-07-14-01-17-832/output/model.tar.gz'
#huggingface_endpoint_name ='huggingface-serverless-ep-2021-12-08-08-38-27'

In [None]:
# Request payloads for the endpoint: a one-line review and a multi-paragraph review.
# NOTE(review): the numeric suffixes presumably indicate approximate length (words or tokens) — confirm.
test_data_16 = {'inputs': "The Phantom Menace was a waste of my life. Die, Jar Jar, die!"}

# The trailing backslashes continue the string literal across lines without inserting newlines.
test_data_250 = {'inputs': "Naked but not afraid, a young man roams the forest, growling in all fours. \
He behaves like a beast. To him, this is not a theatrical exercise but the true manifestation of his instincts. \
In Nathalie Biancheri's offbeat drama “Wolf,” he is one in a group of teenagers convinced their fragile human \
bodies don’t correspond with their animal identities. Their condition, described as “species dysphoria,” \
ostracizes them from society.For Jacob (George MacKay), the wolf in question, being admitted into a facility \
where those afflicted receive corrective treatment is a last frontier between fulfilling his parents’ wish for \
normalcy or running wild without remorse.Jacob steps into a pack of fellow patients and meets among several \
others, Rufus (Fionn O'Shea), who thinks of himself as a lovable German Shepherd, and love interest Wildcat \
(Lily-Rose Depp), a long house-trained resident under the thumb of a key staff member. Some of them have a \
hard time adjusting, and get “prop privileges” to wear costumes that bring them closer to their desired form. \
Despite what it entails, the setup is never played for laughs, but the opposite. Their desperation has a deep \
sadness. But for as much writer/director Biancheri pumps copious ideas into this concept, the solemn tone and \
lack of thematic focus renders the overwrought outing underwhelming. A premise like this would have been more \
effective had it been executed with the acidity of someone like director Yorgos Lanthimos, in which the premise \
could unfold as satirical commentary rather than straightforward indignation. "}

In [None]:
# Time one request with the short payload, then print the elapsed
# wall-clock seconds followed by the raw response body.
tick = time.time()
response = sm_rt.invoke_endpoint(
    EndpointName=huggingface_endpoint_name,
    ContentType='application/json',
    Body=json.dumps(test_data_16),
)
tock = time.time()

print(tock - tick)
print(response['Body'].read())

In [None]:
test_data = test_data_250
num_predictions = 100
num_threads = 8

# Shared result list: one (thread_id, elapsed_seconds) tuple per request.
times = []


def predict():
    """Send `num_predictions` sequential requests and record each latency.

    Reads the module-level `test_data`, `num_predictions`, `sm_rt` and
    `huggingface_endpoint_name`, and appends one
    `(thread_id, elapsed_seconds)` tuple to the shared `times` list for
    every request. Returns nothing.
    """
    thread_id = threading.get_ident()
    print(f'Thread {thread_id} started.')

    # The payload is identical for every request, so serialize it once,
    # outside the timed region.
    payload = json.dumps(test_data)

    for _ in range(num_predictions):
        # perf_counter is monotonic, so the measured interval cannot be
        # skewed by system clock adjustments.
        tick = time.perf_counter()
        sm_rt.invoke_endpoint(
            EndpointName=huggingface_endpoint_name,
            Body=payload,
            ContentType='application/json'
        )
        tock = time.perf_counter()
        times.append((thread_id, tock - tick))


workers = [
    threading.Thread(target=predict, daemon=False)
    for _ in range(num_threads)
]
for worker in workers:
    worker.start()

# Wait for every worker before the cell finishes; otherwise the following
# cells (len, histogram, percentiles) would read a partially filled `times`.
for worker in workers:
    worker.join()

In [None]:
# Number of latency samples collected so far; this reaches
# num_threads * num_predictions once every worker thread has finished.
len(times)

In [None]:
# Keep only the elapsed time of each sample; the thread id is not needed for the statistics.
t = [elapsed for _, elapsed in times]

In [None]:
from matplotlib.pyplot import hist

# Distribution of the recorded request latencies, split into 100 bins.
hist(t, bins=100)

In [None]:
import numpy as np

# Median, p90, p95 and p99 of the recorded request latencies (in seconds).
np.percentile(t, q=[50,90,95,99])

### Cleanup

In [None]:
# Tear down everything created above: the endpoint first,
# then its endpoint configuration, then the model.
sm.delete_endpoint(EndpointName=huggingface_endpoint_name)
sm.delete_endpoint_config(EndpointConfigName=huggingface_epc_name)
sm.delete_model(ModelName=huggingface_model_name)