In [6]:
import sagemaker.huggingface

In [7]:
import sagemaker

sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it not exists
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

role = sagemaker.get_execution_role()
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::839052460858:role/MLOps
sagemaker bucket: sagemaker-us-east-1-839052460858
sagemaker session region: us-east-1


In [8]:
from datasets import load_dataset
from transformers import AutoTokenizer

# tokenizer used in preprocessing
tokenizer_name = 'distilbert-base-uncased'

# dataset used
dataset_name = 'imdb'

# s3 key prefix for the data
s3_prefix = 'samples/datasets/imdb'

In [9]:
# load dataset
dataset = load_dataset(dataset_name)

# download tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# tokenizer helper function
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True)

# load dataset
train_dataset, test_dataset = load_dataset('imdb', split=['train', 'test'])
test_dataset = test_dataset.shuffle().select(range(10000)) # smaller the size for test dataset to 10k 


# tokenize dataset
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# set format for pytorch
train_dataset =  train_dataset.rename_column("label", "labels")
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset = test_dataset.rename_column("label", "labels")
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

Reusing dataset imdb (/home/ec2-user/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset imdb (/home/ec2-user/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/ec2-user/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-14936b1fa1c6aff6.arrow


  0%|          | 0/10 [00:00<?, ?ba/s]

In [11]:
import botocore
from datasets.filesystems import S3FileSystem

s3 = S3FileSystem()

# save train_dataset to s3
training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'
train_dataset.save_to_disk(training_input_path,fs=s3)

# save test_dataset to s3
test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test'
test_dataset.save_to_disk(test_input_path,fs=s3)

In [13]:
from sagemaker.huggingface import HuggingFace

# hyperparameters, which are passed into the training job
hyperparameters={'epochs': 1,
                 'train_batch_size': 32,
                 'model_name':'distilbert-base-uncased'
                 }

In [14]:
huggingface_estimator = HuggingFace(entry_point='train.py',
                            source_dir='./scripts',
                            base_job_name='huggingface-sdk-extension',
                            instance_type='ml.m5.2xlarge',
                            instance_count=1,
                            transformers_version='4.11',
                            pytorch_version='1.9',
                            py_version='py38',
                            role=role,
                            image_uri='839052460858.dkr.ecr.us-east-1.amazonaws.com/hf-pytorch-cpu:1.0',
                            hyperparameters = hyperparameters)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [None]:
huggingface_estimator.fit({'train': training_input_path, 'test': test_input_path})

INFO:sagemaker:Creating training-job with name: huggingface-sdk-extension-2022-12-23-11-07-35-169


2022-12-23 11:07:35 Starting - Starting the training job...
2022-12-23 11:07:50 Starting - Preparing the instances for training......
2022-12-23 11:08:53 Downloading - Downloading input data...
2022-12-23 11:09:33 Training - Training image download completed. Training in progress.....[34m2022-12-23 11:09:57,653 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-12-23 11:09:57,654 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2022-12-23 11:09:57,668 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-12-23 11:09:57,670 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2022-12-23 11:09:57,681 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-12-23 11:09:57,683 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m


[34m#015  0%|          | 0/782 [00:00<?, ?it/s]#015  0%|          | 1/782 [00:19<4:16:24, 19.70s/it]#015  0%|          | 2/782 [00:39<4:15:16, 19.64s/it]#015  0%|          | 3/782 [00:58<4:14:26, 19.60s/it]#015  1%|          | 4/782 [01:18<4:15:56, 19.74s/it]#015  1%|          | 5/782 [01:38<4:15:42, 19.75s/it]#015  1%|          | 6/782 [01:58<4:15:11, 19.73s/it]#015  1%|          | 7/782 [02:18<4:16:26, 19.85s/it]#015  1%|          | 8/782 [02:38<4:15:48, 19.83s/it]#015  1%|          | 9/782 [03:00<4:26:19, 20.67s/it]#015  1%|▏         | 10/782 [03:20<4:22:18, 20.39s/it]#015  1%|▏         | 11/782 [03:40<4:19:29, 20.19s/it]#015  2%|▏         | 12/782 [04:00<4:20:04, 20.27s/it]#015  2%|▏         | 13/782 [04:20<4:17:49, 20.12s/it]#015  2%|▏         | 14/782 [04:40<4:15:57, 20.00s/it]#015  2%|▏         | 15/782 [05:00<4:16:20, 20.05s/it]#015  2%|▏         | 16/782 [05:20<4:14:51, 19.96s/it]#015  2%|▏         | 17/782 [05:39<4:13:47, 19.91s/it]#015  2%|▏         | 18/782 [05:59<4:13:42,

KeyboardInterrupt: 

In [16]:
huggingface_estimator = HuggingFace(entry_point='train.py',
                            source_dir='./scripts',
                            base_job_name='huggingface-sdk-extension',
                            instance_type='ml.m5.2xlarge',
                            instance_count=1,
                            transformers_version='4.11',
                            pytorch_version='1.9',
                            py_version='py38',
                            role=role,
                            image_uri='839052460858.dkr.ecr.us-east-1.amazonaws.com/hf-pytorch-gpu:latest',
                            hyperparameters = hyperparameters)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [17]:
huggingface_estimator.fit({'train': training_input_path, 'test': test_input_path})

INFO:sagemaker:Creating training-job with name: huggingface-sdk-extension-2022-12-23-17-31-18-160


2022-12-23 17:31:18 Starting - Starting the training job...
2022-12-23 17:31:33 Starting - Preparing the instances for training......
2022-12-23 17:32:35 Downloading - Downloading input data...
2022-12-23 17:32:55 Training - Downloading the training image.....................
2022-12-23 17:36:26 Training - Training image download completed. Training in progress....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34msed: can't read changehostname.c: No such file or directory[0m
[34mgcc: error: changehostname.c: No such file or directory[0m
[34mgcc: fatal error: no input files[0m
[34mcompilation terminated.[0m
[34mgcc: error: changehostname.o: No such file or directory[0m
[34mERROR: ld.so: object '/libchangehostname.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.[0m
[34m2022-12-23 17:37:06,938 sagemaker-training-toolkit INFO     Imported framework sagemaker_py

[34m0%|          | 1/782 [00:20<4:30:17, 20.77s/it][0m


KeyboardInterrupt: 