# Hugging Face on SageMaker - fine-tune a DistilBERT model
From https://github.com/huggingface/notebooks/blob/main/sagemaker/01_getting_started_pytorch/sagemaker-notebook.ipynb

# Development environment


In [1]:
!pip install datasets[s3]




In [5]:
import sagemaker.huggingface
from sagemaker.huggingface import HuggingFace
import wandb

from datasets import load_dataset
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score, classification_report

In [6]:
import sagemaker
import boto3
# Bootstrap session: used here only to look up the default bucket.
sess = sagemaker.Session()

# Bucket used for uploading data, models and logs.
# Leave as None to fall back to the session's default bucket
# (SageMaker creates that bucket automatically if it does not exist yet).
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role ARN; when get_execution_role() raises ValueError,
# look the role up by its fixed name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Rebuild the session so it is explicitly bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Report the resolved role, bucket and region.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::739723034235:role/service-role/AmazonSageMaker-ExecutionRole-20240106T094167
sagemaker bucket: sagemaker-us-east-1-739723034235
sagemaker session region: us-east-1


In [7]:
# Authenticate with Weights & Biases before opening a run.
wandb.login()

# Hyperparameters and run metadata tracked alongside the run.
run_config = {
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "num_train_epochs": 2,
    "train_subsample_size": 1000,
    "architecture": "distilbert",
    "dataset_name": "rotten_tomatoes",
    "model_name": "distilbert-base-uncased",
    "instance": "ml.g4dn.2xlarge",
}

wandb.init(
    project="sutd-mlops-project",      # project where this run is logged
    name="experiment_session4_run_1",  # explicit run name instead of a randomly assigned one
    config=run_config,
)

# The following cells read their settings from this object.
config = wandb.config


[34m[1mwandb[0m: Currently logged in as: [33moliviamoveon[0m ([33molivia-liu[0m). Use [1m`wandb login --relogin`[0m to force relogin


# Prepare data

In [8]:
# S3 key prefix under which the prepared dataset splits are stored
s3_prefix = f'samples/datasets/{config.dataset_name}'

In [9]:
# Load every split of the configured dataset.
dataset = load_dataset(config.dataset_name)

# Tokenizer matching the model that will be fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(config.model_name)


def _tokenize_batch(examples):
    """Tokenize a batch of examples, padding/truncating the "text" field to max length."""
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(_tokenize_batch, batched=True)

# Seeded shuffles followed by fixed-size subsamples of each split.
train_dataset = (
    tokenized_datasets["train"]
    .shuffle(seed=42)
    .select(range(config.train_subsample_size))
)
eval_dataset = tokenized_datasets["validation"].shuffle(seed=42).select(range(100))
test_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(100))

# Rename "label" to "labels" and expose the listed columns as torch tensors.
# Only the train and validation subsamples are converted; test_dataset is left as-is.
torch_columns = ['input_ids', 'attention_mask', 'labels']

train_dataset = train_dataset.rename_column("label", "labels")
train_dataset.set_format('torch', columns=torch_columns)

eval_dataset = eval_dataset.rename_column("label", "labels")
eval_dataset.set_format('torch', columns=torch_columns)

In [10]:
# Common S3 location for both prepared splits.
s3_dataset_root = f's3://{sess.default_bucket()}/{s3_prefix}'
training_input_path = f'{s3_dataset_root}/train'
validation_input_path = f'{s3_dataset_root}/validation'

# Write the training subsample, then the validation subsample, to S3.
train_dataset.save_to_disk(training_input_path)
eval_dataset.save_to_disk(validation_input_path)

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

# Train the model

In [11]:
# Print the training script (the estimator's entry point) with syntax highlighting.
!pygmentize ./scripts/train_sagemaker.py


[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer[37m[39;49;00m
[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36mmetrics[39;49;00m [34mimport[39;49;00m accuracy_score, precision_recall_fscore_support[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mdatasets[39;49;00m [34mimport[39;49;00m load_from_disk[37m[39;49;00m
[34mimport[39;49;00m [04m[36mrandom[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mlogging[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36msys[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36margparse[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mtorch[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[34mif[39;49;00m [31m__name__[39;49;00m == [33m"[39;49;00m[33m__main__[39;49;00m[33m"[39;49;00m:[37m[39;49;00m

In [12]:
# Hyperparameters handed to the training job; epochs, learning rate and
# model name come from the tracked wandb config.
hyperparameters = dict(
    epochs=config.num_train_epochs,
    train_batch_size=32,
    learning_rate=config.learning_rate,
    warmup_steps=0,
    model_name=config.model_name,
)

In [13]:
# Settings for the training job: which script to run, on what hardware,
# under which role, and with which framework versions.
estimator_settings = dict(
    entry_point='train_sagemaker.py',
    source_dir='./scripts',
    instance_type=config.instance,
    instance_count=1,
    role=role,
    transformers_version='4.26',
    pytorch_version='1.13',
    py_version='py39',
    hyperparameters=hyperparameters,
)

huggingface_estimator = HuggingFace(**estimator_settings)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [None]:
# Launch the training job on the uploaded splits; the validation split is
# supplied under the 'test' channel name.
training_channels = {
    'train': training_input_path,
    'test': validation_input_path,
}
huggingface_estimator.fit(training_channels)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2024-01-12-16-08-03-983


Using provided s3_resource
2024-01-12 16:08:04 Starting - Starting the training job...
2024-01-12 16:08:19 Starting - Preparing the instances for training...

# Deploy the endpoint

In [None]:
# Deploy the trained model to an endpoint backed by a single instance.
predictor = huggingface_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.g4dn.2xlarge",
)


In [None]:
# Smoke-test the endpoint with a single short review.
sentiment_input = dict(inputs=" great movie")

predictor.predict(sentiment_input)

# Test the model

In [None]:
def map_labels(label):
    """Translate a 'LABEL_0' / 'LABEL_1' label string into the integer 0 / 1.

    Raises KeyError for any other label.
    """
    return {'LABEL_0': 0, 'LABEL_1': 1}[label]

# Send every review text of the test subsample to the endpoint in one request,
# then convert each returned label string to its integer class.
sentiment_input = {"inputs": test_dataset["text"]}
test_output = predictor.predict(sentiment_input)
test_predictions = list(map(map_labels, (item['label'] for item in test_output)))


In [None]:
# compute accuracy on test set
# The gold labels must come from test_dataset, the same shuffled 100-example
# subsample whose texts were sent to the endpoint. Using dataset['test'] (the
# full, unshuffled split) would give a label list of a different length and
# order than test_predictions.
gold_labels = test_dataset['label']
print(classification_report(gold_labels, test_predictions))
print(accuracy_score(gold_labels, test_predictions))

In [None]:
# show examples of review and labels
import pandas as pd
df = pd.DataFrame({"Review": test_dataset['text'],
                   "Gold label": test_dataset['label'],
                   "Predicted label": test_predictions})
df.head()

In [None]:
# clean up
# Delete the model and the endpoint behind `predictor`, then close the wandb run.
# NOTE(review): keep delete_model() before delete_endpoint() — presumably the
# model lookup relies on the endpoint's configuration still existing; confirm
# against the SageMaker SDK before reordering.
predictor.delete_model()
predictor.delete_endpoint()
wandb.finish()

# What to try next
- How does running a SageMaker training job compare with running the training directly in a notebook? Which way of working do you prefer, and why?
- Read the Hugging Face on Amazon SageMaker getting-started guide: https://huggingface.co/docs/sagemaker/getting-started