# Huggingface Sagemaker SDK


## Scenarios:

1. Basic data prep, training, deployment, inference
1. Advanced training: spot, distributed
1. Model Registry comparison between model trained with small training dataset size compared to larger


## Installation

In [2]:
%%capture

!pip install --upgrade "sagemaker>=2.31.0" "transformers==4.4.2" "datasets[s3]==1.5.0"
!conda install -c conda-forge ipywidgets -y

import IPython
IPython.Application.instance().kernel.do_shutdown(True)

import sagemaker.huggingface

## Permissions

In [1]:
import boto3
import sagemaker

session = sagemaker.Session()
bucket = session.default_bucket()
role = sagemaker.get_execution_role()

print(f"SageMaker role arn: {role}")
print(f"SageMaker bucket: {session.default_bucket()}")
print(f"SageMaker session region: {session.boto_region_name}")

Couldn't call 'get_role' to get Role ARN from role name AmazonSageMaker-ExecutionRole-20201221T131849 to get Role path.
Assuming role was created in SageMaker AWS console, as the name contains `AmazonSageMaker-ExecutionRole`. Defaulting to Role ARN with service-role in path. If this Role ARN is incorrect, please add IAM read permissions to your role or supply the Role Arn directly.


SageMaker role arn: arn:aws:iam::061635907654:role/service-role/AmazonSageMaker-ExecutionRole-20201221T131849
SageMaker bucket: sagemaker-us-east-1-061635907654
SageMaker session region: us-east-1


# Preprocessing

## Tokenization 

In [2]:
%%time

import pandas
import datasets
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

CPU times: user 1.11 s, sys: 175 ms, total: 1.29 s
Wall time: 2.55 s


In [13]:
# train_dataset, test_dataset = datasets.load_dataset(
#     "imdb",
#     ignore_verifications = True,
#     split = ["train", "test"]
# )

train_dataset, test_dataset = datasets.load_dataset(
    "imdb", 
    ignore_verifications = True,
    split = ["train", "test"]
)

Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)


In [14]:
tokenize = lambda batch: tokenizer(
    batch["text"], 
    padding = "max_length", 
    truncation = True
)
# test_ds = test_dataset.shuffle().select(range(10000))

train_ds = train_dataset.shuffle().map(tokenize)
test_ds = test_dataset.shuffle().map(tokenize)

try:
    train_ds = train_ds.rename_column("label", "labels")
    test_ds = test_ds.rename_column("label", "labels")
except:
    pass

pandas.DataFrame(train_ds[0:5])

HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))




Unnamed: 0,attention_mask,input_ids,labels,text
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 1045, 1005, 2222, 2022, 14969, 1012, 104...",0,I'll be blunt. I'm not one for politically cor...
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2023, 3185, 2003, 1037, 2200, 3532, 3535...",0,This movie is a very poor attempt to make mone...
2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 1998, 2043, 1045, 3422, 4532, 3165, 2386...",0,"And when I watch Sarah Silverman, I get the sa..."
3,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2732, 5867, 22214, 1011, 1015, 4076, 199...",1,Stargate SG-1 follows and expands upon the Egy...
4,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 2339, 2106, 2023, 3185, 8246, 11088, 102...",1,Why did this movie fail commercially? It's got...


In [15]:
print(len(train_dataset))

25000


In [16]:
columns = ["input_ids", "attention_mask", "labels"]
train_ds.set_format("torch", columns = columns)
test_ds.set_format("torch", columns = columns)

## Uploading data to `sagemaker_session_bucket`

In [17]:
import botocore
from datasets.filesystems import S3FileSystem

s3 = S3FileSystem()

s3_prefix = "datasets/imdb-binary-classification"

training_input_path = f"s3://{bucket}/{s3_prefix}/train"
train_ds.save_to_disk(training_input_path, fs = s3)

test_input_path = f"s3://{bucket}/{s3_prefix}/test"
test_ds.save_to_disk(test_input_path, fs = s3)

# Train a Model

In [18]:
from sagemaker.huggingface import HuggingFace

job_name = "imdb-huggingface"

metric_definitions = [
    {
      "Name": "loss", 
      "Regex": "'loss': ([0-9]+(.|e\-)[0-9]+),?"
    },
    {
      "Name": "learning_rate", 
      "Regex": "'learning_rate': ([0-9]+(.|e\-)[0-9]+),?"
    },
    {
      "Name": "eval_loss", 
      "Regex": "'eval_loss': ([0-9]+(.|e\-)[0-9]+),?"
    },
    {
      "Name": "eval_accuracy", 
      "Regex": "'eval_accuracy': ([0-9]+(.|e\-)[0-9]+),?"
    },
    {
      "Name": "eval_f1", 
      "Regex": "'eval_f1': ([0-9]+(.|e\-)[0-9]+),?"
    },
    {
      "Name": "eval_precision", 
      "Regex": "'eval_precision': ([0-9]+(.|e\-)[0-9]+),?"
    },
    {
      "Name": "eval_recall", 
      "Regex": "'eval_recall': ([0-9]+(.|e\-)[0-9]+),?"
    },
    {
      "Name": "eval_runtime", 
      "Regex": "'eval_runtime': ([0-9]+(.|e\-)[0-9]+),?"
    },
    {
      "Name": "eval_samples_per_second", 
      "Regex": "'eval_samples_per_second': ([0-9]+(.|e\-)[0-9]+),?"
    },
    {
      "Name": "epoch", 
      "Regex": "'epoch': ([0-9]+(.|e\-)[0-9]+),?"
    }
]

params = {
    "base_job_name": job_name,
    "enable_sagemaker_metrics": True,
    "entry_point": "train.py",
    "instance_count": 1,
    "instance_type": "ml.p3.16xlarge",
    "py_version": "py36",
    "pytorch_version": "1.6.0",
    "role": role,
    "source_dir": "./scripts",
    "transformers_version": "4.4.2"
}

spot_params = {
    "checkpoint_s3_uri": f"s3://{bucket}/{job_name}/checkpoints",
    "use_spot_instances": True,
    "max_wait": 3600,
    "max_run": 3600
}

dataparallel_params = {
    "instance_count": 2,
    "distribution": {
        "smdistributed": {
            "dataparallel": {
                "enabled": True
            }
        },
        "mpi": {
            "enabled": True,
            "processes_per_host" : 2
        }
    }
}

modelparallel_params = {
    "instance_count": 2,
    "distribution": {
        "smdistributed": {
            "modelparallel": {
                "enabled": True,
                "parameters": {
                    "microbatches": 4,
                    "placement_strategy": "spread",
                    "pipeline": "interleaved",
                    "optimize": "speed",
                    "partitions": 4,
                    "ddp": True
                }
            }
        },
        "mpi": {
            "enabled": True,
            "processes_per_host" : 2
        }
    }
}


hyperparams = {
    "epochs": 6,
    "eval_batch_size": 128,
    "model_name": model_name,
    "train_batch_size": 64
}

def use_standard_training():
    return HuggingFace(
        **params,
        hyperparameters = hyperparams,
        metric_definitions = metric_definitions
    )

def use_spot():
    return HuggingFace(
        **params,
        **spot_params,
        metric_definitions = metric_definitions,
        hyperparameters = {
            **hyperparams,
            "output_dir": "/opt/ml/checkpoints"
        }
    )

def use_spot_distributed(distributed_params = dataparallel_params):
    return HuggingFace(**{
        **params,
        **spot_params, 
        **distributed_params,
        "metric_definitions": metric_definitions,
        "hyperparameters": {
            **hyperparams,
            "output_dir": "/opt/ml/checkpoints"
        }
    })


def use_distributed(distributed_params = dataparallel_params):
    return HuggingFace(**{
        **params,
        **distributed_params,
        "metric_definitions": metric_definitions
    })

In [None]:
%%time

inputs = { 
    "train": training_input_path, 
    "test": test_input_path 
}

estimators = [it() for it in (use_standard_training,)]
for it in estimators:
    it.fit(inputs, wait = False)

2021-05-05 20:29:36 Starting - Starting the training job...
2021-05-05 20:30:02 Starting - Launching requested ML instancesProfilerReport-1620246576: InProgress
............
2021-05-05 20:32:03 Starting - Preparing the instances for training.........
2021-05-05 20:33:29 Downloading - Downloading input data
2021-05-05 20:33:29 Training - Downloading the training image..................
2021-05-05 20:36:28 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-05-05 20:36:28,589 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-05-05 20:36:28,666 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-05-05 20:36:34,906 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-05-05 20:36:35,347 sagemaker-

# Load a Model

In [10]:
from sagemaker import TrainingJobAnalytics

for it in estimators:
    df = TrainingJobAnalytics(training_job_name = it.latest_training_job.name).dataframe()
    display(df[["metric_name", "value"]].groupby("metric_name").max())



Unnamed: 0_level_0,value
metric_name,Unnamed: 1_level_1
epoch,0.895
learning_rate,2.500005
loss,0.689975


In [11]:
import time
from sagemaker.utils import name_from_base
from sagemaker.pytorch import PyTorchModel
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

class SentimentAnalysis(Predictor):
    def __init__(self, endpoint_name, sagemaker_session):
        super().__init__(
            endpoint_name, 
            sagemaker_session = sagemaker_session, 
            serializer = JSONSerializer(), 
            deserializer = JSONDeserializer()
        )

        
names = []
for _ in estimators:
    names.append(name_from_base("imdb-huggingface"))
    time.sleep(1)

models = [
    PyTorchModel(
        name = name,
        role = role, 
        model_data = estimator.model_data,
        source_dir = "./scripts",
        entry_point = "torchserve-predictor.py",
        framework_version = "1.6.0",
        py_version = "py36",
        predictor_cls = SentimentAnalysis
    ) for name, estimator in zip(names, estimators)
]

predictors = [
    model.deploy(
        initial_instance_count = 1, 
        instance_type = "ml.m5.large",
        endpoint_name = name,
        wait = False
    ) for name, model in zip(names, models)
]

ClientError: An error occurred (404) when calling the HeadObject operation: Not Found

# Make Inferences

In [108]:
inputs = [
    "Willow is the greatest movie that has ever lived.",
    "The Notebook is ironically depressing.",
    "My cat's breath smells like cat food."
]

for predictor in predictors:
    print(predictor.endpoint_name)
    for it in inputs:
        prediction = predictor.predict({"text": it})
        print(f'    {prediction}: {it}')

imdb-huggingface-2021-05-05-14-06-19-051
    POSITIVE: Willow is the greatest movie that has ever lived.
    NEGATIVE: The Notebook is ironically depressing.
    NEGATIVE: My cat's breath smells like cat food.
imdb-huggingface-2021-05-05-14-06-20-052
    POSITIVE: Willow is the greatest movie that has ever lived.
    NEGATIVE: The Notebook is ironically depressing.
    NEGATIVE: My cat's breath smells like cat food.


# Clean Up

In [109]:
try:
    for i in range(len(models)):
        predictors[i].delete_endpoint()
        models[i].delete_model()
except:
    display("Already deleted")