# Huggingface Sagemaker SDK

## Installation

In [1]:
%%capture

!pip install --upgrade "sagemaker>=2.31.0" "transformers==4.4.2" "datasets[s3]==1.5.0"
!conda install -c conda-forge ipywidgets -y

import IPython
IPython.Application.instance().kernel.do_shutdown(True)

import sagemaker.huggingface

## Permissions

In [2]:
import sagemaker

sess = sagemaker.Session()
sagemaker_session_bucket = None
if sagemaker_session_bucket is None and sess is not None:
    sagemaker_session_bucket = sess.default_bucket()

role = sagemaker.get_execution_role()
sess = sagemaker.Session(default_bucket = sagemaker_session_bucket)

print(f"SageMaker role arn: {role}")
print(f"SageMaker bucket: {sess.default_bucket()}")
print(f"SageMaker session region: {sess.boto_region_name}")

Couldn't call 'get_role' to get Role ARN from role name AmazonSageMaker-ExecutionRole-20201221T131849 to get Role path.
Assuming role was created in SageMaker AWS console, as the name contains `AmazonSageMaker-ExecutionRole`. Defaulting to Role ARN with service-role in path. If this Role ARN is incorrect, please add IAM read permissions to your role or supply the Role Arn directly.


SageMaker role arn: arn:aws:iam::061635907654:role/service-role/AmazonSageMaker-ExecutionRole-20201221T131849
SageMaker bucket: sagemaker-us-east-1-061635907654
SageMaker session region: us-east-1


# Preprocessing

## Tokenization 

In [22]:
%%time

import datasets
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = AutoTokenizer.from_pretrained(model_name)

train_dataset, test_dataset = datasets.load_dataset(
    'imdb',
    ignore_verifications = True,
    split = ['train', 'test']
)

def tokenize(batch):
    return tokenizer(batch['text'], padding = 'max_length', truncation = True)

test_ds = test_dataset.shuffle().select(range(10000))

# tokenize dataset
train_ds = train_dataset.map(tokenize)
test_ds = test_ds.map(tokenize)

# set format for pytorch
train_ds = train_ds.rename_column("label", "labels")
train_ds.set_format('torch', columns = ['input_ids', 'attention_mask', 'labels'])

test_ds = test_ds.rename_column("label", "labels")
test_ds.set_format('torch', columns = ['input_ids', 'attention_mask', 'labels'])



HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))


CPU times: user 37.9 s, sys: 422 ms, total: 38.3 s
Wall time: 39.8 s


## Uploading data to `sagemaker_session_bucket`

In [23]:
import botocore
from datasets.filesystems import S3FileSystem

s3 = S3FileSystem()

s3_prefix = "datasets/imdb-binary-classification"

training_input_path = f"s3://{sess.default_bucket()}/{s3_prefix}/train"
train_ds.save_to_disk(training_input_path, fs = s3)

# save test_dataset to s3
test_input_path = f"s3://{sess.default_bucket()}/{s3_prefix}/test"
test_ds.save_to_disk(test_input_path, fs = s3)

# Train a Model

In [33]:
from sagemaker.huggingface import HuggingFace

job_name = "imdb-huggingface"

params = {
    "base_job_name": job_name,
    "entry_point": "train.py",
    "source_dir": "./scripts",
    "instance_type": "ml.p3.16xlarge",
    "instance_count": 1,
    "role": role,
    "transformers_version": "4.4.2",
    "pytorch_version": "1.6.0",
    "py_version": "py36"
}

spot_params = {
    "checkpoint_s3_uri": f"s3://{sess.default_bucket()}/{job_name}/checkpoints",
    "use_spot_instances": True,
    "max_wait": 3600,
    "max_run": 660    
}

distributed_params = {
    "instance_count": 2,
    "distribution": {
#         "mpi": {
#             "enabled": True,
#             "processes_per_host" : 1
#         },
        "smdistributed": {
            "dataparallel": {
                "enabled": True
            }
        }
#         "modelparallel": {
#             "enabled": True,
#             "parameters": {
#                 "partitions": 2
#             }
#         }
    }
}

hyperparams = {
    "epochs": 6,
    "train_batch_size": 64,
    "eval_batch_size": 128,
    "model_name": model_name 
}

def use_standard_training():
    return HuggingFace(
        **params,
        hyperparameters = hyperparams
    )

def use_spot():
    return HuggingFace(
        **params,
        **spot_params,
        hyperparameters = {
            **hyperparams,
            "output_dir": "/opt/ml/checkpoints"
        }
    )

def use_spot_distributed():
    return HuggingFace(**{
        **params,
        **spot_params, 
        **distributed_params,
        hyperparameters = {
            **hyperparams,
            "output_dir": "/opt/ml/checkpoints"
        }
    })


def use_distributed():
    return HuggingFace(**{
        **params,
        **distributed_params
    })

estimator = use_spot_distributed()


In [None]:
%%time

estimator.fit({
    "train": training_input_path, 
    "test": test_input_path
})

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: imdb-huggingface-2021-04-28-21-15-53-925


2021-04-28 21:15:54 Starting - Starting the training job...
2021-04-28 21:16:17 Starting - Launching requested ML instancesProfilerReport-1619644554: InProgress
............
2021-04-28 21:18:17 Starting - Preparing the instances for training.........
2021-04-28 21:19:41 Downloading - Downloading input data...
2021-04-28 21:20:24 Training - Downloading the training image...............
2021-04-28 21:22:54 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-04-28 21:22:50,059 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-04-28 21:22:50,137 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[35mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[35mbash: no job control in this shell[0m
[35m2021-04-28 21:

# Load a Model

In [27]:
import tarfile
from sagemaker.s3 import S3Downloader
from sagemaker.estimator import Estimator

_, model_key = sagemaker.s3.parse_s3_url(estimator.model_data)
job_name, *rest = model_key.split("/")
local_path = f"./models/{job_name}"

S3Downloader.download(
    s3_uri = estimator.model_data,
    local_path = local_path,
    sagemaker_session = sess
)

tarfile.open(f'{local_path}/model.tar.gz', mode = 'r:gz').extractall(path = local_path)

# Make Inferences

In [28]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from pprint import pprint

models = [
    AutoModelForSequenceClassification.from_pretrained(model_name),
    AutoModelForSequenceClassification.from_pretrained(local_path)
]

In [29]:
seqs = [
    "Good. The best movie I've ever seen in my life.",
    "Bad. The worst thing ever."
]

for it in models:
    classifier = pipeline(
        "sentiment-analysis", 
        model = it, 
        tokenizer = tokenizer
    )
    result = classifier(seqs)
    
    for i in range(len(result)):
        res = result[i]
        res["label"] = "Positive" if res["label"] == "LABEL_1" else "Negative"
        res["sequence"] = seqs[i]
        
    pprint(result)
    print()

[{'label': 'Negative',
  'score': 0.9998512268066406,
  'sequence': "Good. The best movie I've ever seen in my life."},
 {'label': 'Negative',
  'score': 0.99973464012146,
  'sequence': 'Bad. The worst thing ever.'}]

[{'label': 'Negative',
  'score': 0.9996551275253296,
  'sequence': "Good. The best movie I've ever seen in my life."},
 {'label': 'Negative',
  'score': 0.9996955394744873,
  'sequence': 'Bad. The worst thing ever.'}]

