# Summarizing legal documents with Hugging Face and Amazon Sagemaker

In [1]:
# Number of parameters for flan-t5 family: small 80M, base 250M, large 780M, xl 3B, xxl 11B
model_id = "google/flan-t5-xl"

# https://huggingface.co/datasets/abisee/cnn_dailymail
dataset_name, dataset_version = "cnn_dailymail", "3.0.0"

# Setup

In [2]:
!pip -q install transformers datasets sagemaker --upgrade

In [3]:
!pip -q install widgetsnbextension ipywidgets

In [4]:
import sagemaker

print(sagemaker.__version__)

sess = sagemaker.Session()
# bucket = sess.default_bucket()
# bucket = "styx-nlp-datasets"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
2.221.1


In [5]:
import transformers
import datasets

print(transformers.__version__)
print(datasets.__version__)

4.41.2
2.19.1


# Preprocessing

## Load dataset

In [6]:
from datasets import load_dataset, load_from_disk

dataset = load_dataset(dataset_name, dataset_version)
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

## Preprocess dataset 

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)

prefix = "summarize: "
input_max_length = 2048
output_max_length = 512


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=input_max_length, truncation=True)
    labels = tokenizer(
        text_target=examples["highlights"], max_length=output_max_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [9]:
tokenized_dataset = dataset.map(
    preprocess_function, batched=True, remove_columns=["article", "highlights", "id"]
)

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
# tokenized_dataset.save_to_disk(f"cnn_dailymail-t5-tokenized")

# Upload processed dataset to S3

In [10]:
import boto3
import os

s3_client = boto3.client('s3')
bucket = "summary-model-data"
s3_prefix = "huggingface/cnn_dailymail-t5-summarization"

dataset_input_path = "s3://{}/{}".format(bucket, s3_prefix)
train_input_path = "{}/train".format(dataset_input_path)
valid_input_path = "{}/validation".format(dataset_input_path)

print(dataset_input_path)
print(train_input_path)
print(valid_input_path)

s3://summary-model-data/huggingface/cnn_dailymail-t5-summarization
s3://summary-model-data/huggingface/cnn_dailymail-t5-summarization/train
s3://summary-model-data/huggingface/cnn_dailymail-t5-summarization/validation


In [11]:
# Save tokenized dataset locally first
local_train_path = "local_train_data"
local_valid_path = "local_valid_data"

tokenized_dataset["train"].save_to_disk(local_train_path)
tokenized_dataset["test"].save_to_disk(local_valid_path)

# Upload the local files to S3
for root, dirs, files in os.walk(local_train_path):
    for file in files:
        s3_client.upload_file(
            os.path.join(root, file),
            bucket,
            os.path.join(s3_prefix, 'train', os.path.relpath(os.path.join(root, file), local_train_path))
        )

for root, dirs, files in os.walk(local_valid_path):
    for file in files:
        s3_client.upload_file(
            os.path.join(root, file),
            bucket,
            os.path.join(s3_prefix, 'validation', os.path.relpath(os.path.join(root, file), local_valid_path))
        )

# Verify by listing the uploaded files
response = s3_client.list_objects_v2(Bucket=bucket, Prefix=s3_prefix)
for obj in response.get('Contents', []):
    print(obj['Key'])

Saving the dataset (0/4 shards):   0%|          | 0/287113 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/11490 [00:00<?, ? examples/s]

huggingface/cnn_dailymail-t5-summarization/train/data-00000-of-00004.arrow
huggingface/cnn_dailymail-t5-summarization/train/data-00001-of-00004.arrow
huggingface/cnn_dailymail-t5-summarization/train/data-00002-of-00004.arrow
huggingface/cnn_dailymail-t5-summarization/train/data-00003-of-00004.arrow
huggingface/cnn_dailymail-t5-summarization/train/dataset_info.json
huggingface/cnn_dailymail-t5-summarization/train/state.json
huggingface/cnn_dailymail-t5-summarization/validation/data-00000-of-00001.arrow
huggingface/cnn_dailymail-t5-summarization/validation/dataset_info.json
huggingface/cnn_dailymail-t5-summarization/validation/state.json


In [None]:
# from datasets.filesystems import S3FileSystem

# s3 = S3FileSystem()

# s3_prefix = "huggingface/cnn_dailymail-t5-summarization"

# dataset_input_path = "s3://{}/{}".format(bucket, s3_prefix)
# train_input_path = "{}/train".format(dataset_input_path)
# valid_input_path = "{}/validation".format(dataset_input_path)

# print(dataset_input_path)
# print(train_input_path)
# print(valid_input_path)

In [None]:
# tokenized_dataset["train"].save_to_disk(train_input_path, fs=s3)
# tokenized_dataset["test"].save_to_disk(valid_input_path, fs=s3)

In [None]:
#%%sh -s $dataset_input_path
#aws s3 ls --recursive $1

In [None]:
#train_ds = load_from_disk(train_input_path)
#valid_ds = load_from_disk(valid_input_path)

# Fine-tune on SageMaker with a Hugging Face Deep Learning Container

In [13]:
!pygmentize train.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34mimport[39;49;00m [04m[36margparse[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mlogging[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mtorch[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[34mimport[39;49;00m [04m[36mevaluate[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mnumpy[39;49;00m [34mas[39;49;00m [04m[36mnp[39;49;00m[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mdatasets[39;49;00m [34mimport[39;49;00m load_from_disk[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m ([37m[39;49;00m
    AutoModelForSeq2SeqLM,[37m[39;49;00m
    AutoTokenizer,[37m[39;49;00m
    DataCollatorForSeq2Seq,[37m[39;49;00m
    Seq2SeqTrainer,[37m[39;49;00m
    Seq2SeqTrainingArguments,[37m[39;49;00m
    BitsAndBytesConfig[37m[39;49;00m
)[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mpeft[39;49;00m [34mimport[39;49;00m LoraCon

In [20]:
hyperparameters = {
    "epochs": 2,
    "learning-rate": 1e-6,
    "train-batch-size": 1,
    "eval-batch-size": 8,
    "model-name": model_id,
    "save-total-limit": 3,
    "load-best-model-at-end": True,
    "save-strategy": "epoch",
    "evaluation-strategy": "epoch"
}

In [21]:
from sagemaker.huggingface import HuggingFace

huggingface_estimator = HuggingFace(
    role=sagemaker.get_execution_role(),
    # Fine-tuning script
    entry_point="train.py",
    dependencies=["requirements.txt"],
    hyperparameters=hyperparameters,
    # Infrastructure
    transformers_version="4.28.1",
    pytorch_version="2.0.0",
    py_version="py310",
    instance_type="ml.g5.xlarge",
    instance_count=1,
    use_spot_instances=True,
    max_run=86400, # 24 hours
    max_wait=86400,
)

In [22]:
huggingface_estimator.fit({"train": train_input_path, "valid": valid_input_path})

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2024-06-01-21-15-02-130


2024-06-01 21:15:02 Starting - Starting the training job...
2024-06-01 21:15:17 Starting - Preparing the instances for training...
2024-06-01 21:15:46 Downloading - Downloading input data...
2024-06-01 21:16:32 Downloading - Downloading the training image.................................
2024-06-01 21:21:43 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-06-01 21:21:53,388 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-06-01 21:21:53,405 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-06-01 21:21:53,415 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-06-01 21:21:53,422 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2024-06-01 

KeyboardInterrupt: 

In [None]:
huggingface_estimator.model_data

# Copy model and predict

In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM
from datasets import load_dataset

model_id = "juliensimon/flan-t5-large-billsum-qlora"
base_model_id = "google/flan-t5-large"

config = PeftConfig.from_pretrained(model_id)
base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_id)
model = PeftModel.from_pretrained(base_model, model_id)
model.eval()

In [None]:
dataset = load_dataset("billsum")
sample = dataset['test'][123]

In [None]:
sample['text']

In [None]:
input_ids = tokenizer(sample["text"], return_tensors="pt", truncation=True).input_ids

outputs = model.generate(input_ids=input_ids, max_new_tokens=64, do_sample=True, top_p=0.8)

tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]