#  Legal document Summarization with Hugging Face FlanT5 on Amazon Sagemaker

In [15]:
# Number of parameters for flan-t5 family: small 80M, base 250M, large 780M, xl 3B, xxl 11B
model_id = "google/flan-t5-large"

# https://huggingface.co/datasets/billsum
dataset_id = "billsum"

# Setup

In [16]:
!pip -q install transformers datasets sagemaker --upgrade

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [17]:
!pip -q install widgetsnbextension ipywidgets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [18]:
import sagemaker

print(sagemaker.__version__)

sess = sagemaker.Session()
bucket = sess.default_bucket()

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


2.151.0


In [19]:
import transformers
import datasets

print(transformers.__version__)
print(datasets.__version__)

4.28.1
2.12.0


# Preprocessing

## Load dataset

In [20]:
from datasets import load_dataset, load_from_disk

dataset = load_dataset(dataset_id)
dataset



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 18949
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 3269
    })
    ca_test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1237
    })
})

## Preprocess dataset 

In [21]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)

prefix = "summarize: "
input_max_length = 1024
output_max_length = 128


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=input_max_length, truncation=True)
    labels = tokenizer(
        text_target=examples["summary"], max_length=output_max_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [22]:
tokenized_dataset = dataset.map(
    preprocess_function, batched=True, remove_columns=["title", "text", "summary"]
)



Map:   0%|          | 0/3269 [00:00<?, ? examples/s]



In [None]:
#tokenized_dataset.save_to_disk(f"billsum-t5-tokenized")

# Upload processed dataset to S3

In [23]:
from datasets.filesystems import S3FileSystem

s3 = S3FileSystem()

s3_prefix = "huggingface/billsum-t5-summarization"

dataset_input_path = "s3://{}/{}".format(bucket, s3_prefix)
train_input_path = "{}/train".format(dataset_input_path)
valid_input_path = "{}/validation".format(dataset_input_path)

print(dataset_input_path)
print(train_input_path)
print(valid_input_path)

s3://sagemaker-us-east-1-231891361855/huggingface/billsum-t5-summarization
s3://sagemaker-us-east-1-231891361855/huggingface/billsum-t5-summarization/train
s3://sagemaker-us-east-1-231891361855/huggingface/billsum-t5-summarization/validation


In [24]:
tokenized_dataset["train"].save_to_disk(train_input_path, fs=s3)
tokenized_dataset["test"].save_to_disk(valid_input_path, fs=s3)

Saving the dataset (0/1 shards):   0%|          | 0/18949 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3269 [00:00<?, ? examples/s]

In [None]:
#%%sh -s $dataset_input_path
#aws s3 ls --recursive $1

In [None]:
#train_ds = load_from_disk(train_input_path)
#valid_ds = load_from_disk(valid_input_path)

# Fine-tune on SageMaker with a Hugging Face Deep Learning Container

In [25]:
!pygmentize train.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34mimport[39;49;00m [04m[36margparse[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mlogging[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[34mimport[39;49;00m [04m[36mevaluate[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mnumpy[39;49;00m [34mas[39;49;00m [04m[36mnp[39;49;00m[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mdatasets[39;49;00m [34mimport[39;49;00m load_from_disk[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m ([37m[39;49;00m
    AutoModelForSeq2SeqLM,[37m[39;49;00m
    AutoTokenizer,[37m[39;49;00m
    DataCollatorForSeq2Seq,[37m[39;49;00m

In [26]:
hyperparameters = {
    "epochs": 1,
    "learning-rate": 1e-6,
    "train-batch-size": 1,
    "eval-batch-size": 8,
    "model-name": model_id,
}

In [27]:
from sagemaker.huggingface import HuggingFace

huggingface_estimator = HuggingFace(
    role=sagemaker.get_execution_role(),
    # Fine-tuning script
    entry_point="train.py",
    dependencies=["requirements.txt"],
    hyperparameters=hyperparameters,
    # Infrastructure
    transformers_version="4.26.0",
    pytorch_version="1.13.1",
    py_version="py39",
    instance_type="ml.p3dn.24xlarge",
    instance_count=1,
    distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [28]:
huggingface_estimator.fit({"train": train_input_path, "valid": valid_input_path})

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2023-04-30-05-38-26-789


Using provided s3_resource
2023-04-30 05:38:27 Starting - Starting the training job...
2023-04-30 05:38:44 Starting - Preparing the instances for training............
2023-04-30 05:40:49 Downloading - Downloading input data
2023-04-30 05:40:49 Training - Downloading the training image.....................
2023-04-30 05:44:05 Training - Training image download completed. Training in progress....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-04-30 05:44:47,906 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-04-30 05:44:47,973 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-04-30 05:44:47,983 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-04-30 05:44:47,985 sagemaker_pytorch_container.training INFO     Invoking SMDataParallel[0m
[

In [29]:
huggingface_estimator.model_data

's3://sagemaker-us-east-1-231891361855/huggingface-pytorch-training-2023-04-30-05-38-26-789/output/model.tar.gz'

In [30]:
dataset['test'][10]

{'text': "SECTION 1. SHORT TITLE.\n\n    This Act may be cited as the ``Outer Continental Shelf Revenue \nSharing Act of 2005''.\n\nSEC. 2. OUTER CONTINENTAL SHELF REVENUE SHARING.\n\n    Section 31 of the Outer Continental Shelf Lands Act (43 U.S.C. \n1356a) is amended--\n            (1) in subsection (a)--\n                    (A) by striking paragraph (7);\n                    (B) by redesignating paragraphs (8), (9), and (10) \n                as paragraphs (7), (8), and (9), respectively;\n                    (C) in paragraph (8) (as redesignated by \n                subparagraph (B)), by striking subparagraph (B) and \n                inserting the following:\n                    ``(B) Inclusion.--The term `producing State' \n                includes any State that begins production on a leased \n                tract on or after the date of enactment of the Outer \n                Continental Shelf Revenue Sharing Act of 2005, \n                regardless of whether the leased t

# Deploy on SageMaker with a Hugging Face Deep Learning Container

In [31]:
huggingface_predictor = huggingface_estimator.deploy(
    initial_instance_count=1, instance_type="ml.p3.2xlarge"
)

INFO:sagemaker:Creating model with name: huggingface-pytorch-training-2023-04-30-06-18-09-213
INFO:sagemaker:Creating endpoint-config with name huggingface-pytorch-training-2023-04-30-06-18-09-213
INFO:sagemaker:Creating endpoint with name huggingface-pytorch-training-2023-04-30-06-18-09-213


-----------!

In [32]:
test_data = {"inputs": f"{prefix}: {dataset['test'][10]}"}

In [33]:
prediction = huggingface_predictor.predict(test_data)
print(prediction)

[{'generated_text': 'A bill to amend the Outer Continental Shelf Lands Act to allow certain coastal States to share'}]


In [34]:
huggingface_predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: huggingface-pytorch-training-2023-04-30-06-18-09-213
INFO:sagemaker:Deleting endpoint with name: huggingface-pytorch-training-2023-04-30-06-18-09-213
