## Installation

In [1]:
!pip install 'sagemaker' 'transformers' 'datasets[s3]' 's3fs' --upgrade --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
awscli 1.22.97 requires botocore==1.24.42, but you have botocore 1.23.24 which is incompatible.[0m


## Environment set-up

In [2]:
import ast
import botocore
import os
import pandas as pd
import sagemaker
import sagemaker.huggingface

from datasets import Dataset
from datasets.filesystems import S3FileSystem
from sagemaker.huggingface import HuggingFace
from transformers import AutoTokenizer

## AWS permissions

In [3]:
sess = sagemaker.Session()
sagemaker_session_bucket = None
if sagemaker_session_bucket is None and sess is not None:
    sagemaker_session_bucket = sess.default_bucket()

role = sagemaker.get_execution_role()
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f'Role ARN: {role}')
print(f'Bucket: {sess.default_bucket()}')
print(f'Region: {sess.boto_region_name}')

Role ARN: arn:aws:iam::583579242701:role/sagemaker
Bucket: sagemaker-us-east-1-583579242701
Region: us-east-1


## Pre-processing

In [4]:
# Model/tokenizer checkpoint
CHECKPOINT = 'sshleifer/distilbart-cnn-12-6'

# Data directory
DATA_DIR = './data/'

# S3 key prefix for the data
S3_PREFIX = 'datasets'

In [5]:
def get_prefix(checkpoint):
    """Returns a prefix to be prepended to the input text."""
    if 't5' in checkpoint:
        return 'summarize: '
    else:
        return ''
    
def load_paragraph_dataset(split):
    """Loads a paragraph-level dataset for a specific split."""
    file_path = os.path.join(DATA_DIR, split + '.csv')
    df = pd.read_csv(file_path)
    df = df[['text', 'summary']]
    df['summary'] = df['summary'].apply(lambda row: ' '.join(ast.literal_eval(row)))
    dataset = Dataset.from_pandas(df)
    return dataset
    
def preprocess_batch(batch):
    """Preprocesses a batch of inputs for fine-tuning."""
    inputs = [prefix + text for text in batch['text']]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(batch['summary'], max_length=512, truncation=True)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

def preprocess(dataset):
    """Preprocesses a dataset for fine-tuning."""
    dataset = dataset.map(preprocess_batch, batched=True)
    dataset = dataset.remove_columns(['text', 'summary'])
    return dataset

In [6]:
# Download tokenizer
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
prefix = get_prefix(CHECKPOINT)

# Load datasets
train_dataset = load_paragraph_dataset('train')
valid_dataset = load_paragraph_dataset('valid')

# Preprocessing
train_dataset = preprocess(train_dataset)
valid_dataset = preprocess(valid_dataset)

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.76k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

  0%|          | 0/117 [00:00<?, ?ba/s]

  0%|          | 0/16 [00:00<?, ?ba/s]

### Upload the data to `sagemaker_session_bucket`

In [7]:
s3 = S3FileSystem()

# Save the train dataset to S3
train_input_path = f's3://{sess.default_bucket()}/{S3_PREFIX}/train'
train_dataset.save_to_disk(train_input_path, fs=s3)

# Save the valid dataset to S3
valid_input_path = f's3://{sess.default_bucket()}/{S3_PREFIX}/valid'
valid_dataset.save_to_disk(valid_input_path, fs=s3)

## Fine-tuning job

In [8]:
!pygmentize ./scripts/finetune_booksum.py

[34mimport[39;49;00m [04m[36mrandom[39;49;00m
[34mimport[39;49;00m [04m[36mlogging[39;49;00m
[34mimport[39;49;00m [04m[36msys[39;49;00m
[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mtorch[39;49;00m

[34mfrom[39;49;00m [04m[36mdatasets[39;49;00m [34mimport[39;49;00m Dataset, load_from_disk, load_metric
[34mfrom[39;49;00m [04m[36mfunctools[39;49;00m [34mimport[39;49;00m partial
[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m (
    AutoTokenizer,
    BartForConditionalGeneration,
    PegasusForConditionalGeneration,
    T5ForConditionalGeneration, 
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)


[34mdef[39;49;00m [32mload_pretrained_model[39;49;00m(model_name):
    [33m"""Loads a pre-trained conditional generation model from the Huggingface hub."""[39;49;00m
    [34mif[39;49;00m [

### Create an Estimator and start a training job

In [15]:
# Hyperparameters passed into the training job
hyperparameters={
    'model_name': 'sshleifer/distilbart-cnn-12-6',
    'train_batch_size': 4,
    'eval_batch_size': 8,
    'epochs': 2,
    'push_to_hub': True,
    'hub_model_id': 'distilbart-cnn-12-6-booksum',
    'hub_strategy': 'every_save',
    'hub_token': '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'
}

In [16]:
# Create the HF estimator
hf_estimator = HuggingFace(
    entry_point='./scripts/finetune_booksum.py',
    source_dir="./",
    instance_type='ml.g5.xlarge',
    instance_count=1,
    role=role,
    transformers_version='4.17.0',
    pytorch_version='1.10.2',
    py_version='py38',
    hyperparameters=hyperparameters,
)

In [None]:
# Run the training job
hf_estimator.fit({'train': train_input_path, 'valid': valid_input_path})

2022-07-04 21:22:32 Starting - Starting the training job...
2022-07-04 21:22:56 Starting - Preparing the instances for trainingProfilerReport-1656969752: InProgress
......
2022-07-04 21:24:02 Downloading - Downloading input data...
2022-07-04 21:24:32 Training - Downloading the training image........................
2022-07-04 21:28:24 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
  "class": algorithms.Blowfish,[0m
[34m2022-07-04 21:28:26,808 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-07-04 21:28:26,828 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-07-04 21:28:26,836 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-07-04 21:28:28,627 sagemaker-training-toolkit INFO     Invoking

### Estimator Parameters

In [18]:
print(f'Container image used for training job: \n{hf_estimator.image_uri}\n')
print(f'S3 URI where the trained model is located: \n{hf_estimator.model_data}\n')
print(f'Latest training job name for this estimator: \n{hf_estimator.latest_training_job.name}\n')

Container image used for training job: 
None

S3 URI where the trained model is located: 
s3://sagemaker-us-east-1-583579242701/huggingface-pytorch-training-2022-07-04-21-22-16-684/output/model.tar.gz

Latest training job name for this estimator: 
huggingface-pytorch-training-2022-07-04-21-22-16-684

