In [1]:
!pip install datasets transformers evaluate huggingface_hub rouge_score

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evalua

In [2]:
from datasets import load_dataset

billsum = load_dataset("billsum", split="ca_test")

Downloading readme:   0%|          | 0.00/6.87k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/91.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/15.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

In [3]:
billsum = billsum.train_test_split(test_size=0.2)

billsum

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 989
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 248
    })
})

In [4]:
billsum["train"][0]["text"]

'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 10095 of the Insurance Code is amended to read:\n10095.\n(a) Within 30 days following the effective date of this chapter, the association shall submit to the commissioner, for his or her review, a proposed plan of operation, consistent with the provisions of this chapter, creating an association consisting of all insurers licensed to write and engaged in writing in this state, on a direct basis, basic property insurance or any component of basic property insurance in homeowners or other dwelling multiperil policies. An insurer described in this subdivision shall be a member of the association and shall remain a member as a condition of its authority to transact those kinds of insurance in this state.\n(b) The proposed plan shall authorize the association to assume and cede reinsurance on risks written by insurers in conformity with the program.\n(c) Under the plan, an insurer shall participate in the w

In [5]:
billsum["train"][0]["summary"]

'Under existing law, the California FAIR (fair access to insurance requirements) Plan Association is a joint reinsurance association of state insurers that is established to, among other things, assist persons in securing basic property insurance for qualified property for which insurance cannot be obtained through the normal insurance market. Existing law requires the association to establish and maintain a toll-free telephone number through which a person may receive assistance in applying for basic property insurance. Existing law requires an insurer member of the plan to provide to an applicant who is denied coverage the toll-free telephone number for the plan for information and assistance in obtaining basic property insurance. Existing law requires an agent or broker transacting basic property insurance to either assist a person in making an application for insurance through the plan or to provide the person with that toll-free telephone number.\nThis bill would additionally requ

In [6]:
billsum

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 989
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 248
    })
})

# Prepare dataset

In [7]:
from transformers import AutoTokenizer

checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [8]:
examples = billsum["train"][:3]

inputs = tokenizer(
    examples["text"], truncation=True, max_length=1024,
)

inputs.keys()

dict_keys(['input_ids', 'attention_mask'])

We're going to be using the T5 model, which requires that the text be prefixed with "summarize: "

In [9]:
prefix = "summarize: "
max_text_length = 1024
max_summary_length = 128

def tokenize_examples(examples):
  # add prefix to texts
  texts = [prefix + doc for doc in examples["text"]]
  # tokenize texts
  tokenized_texts = tokenizer(
      texts, truncation=True, max_length=max_text_length,
  )
  # tokenize summaries
  labels = tokenizer(
      text_target=examples["summary"], truncation=True,
      max_length=max_summary_length
  )
  tokenized_texts["labels"] = labels["input_ids"]

  return tokenized_texts

In [10]:
inputs = tokenize_examples(examples)
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [11]:
tokenized_datasets = billsum.map(
    tokenize_examples,
    batched=True,
    remove_columns=billsum["train"].features
)

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [12]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 989
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 248
    })
})

Let's get the model

In [13]:
from transformers import TFAutoModelForSeq2SeqLM

model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [14]:
model.summary()

Model: "tft5_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)          multiple                  16449536  
                                                                 
 encoder (TFT5MainLayer)     multiple                  35330816  
                                                                 
 decoder (TFT5MainLayer)     multiple                  41625344  
                                                                 
Total params: 60506624 (230.81 MB)
Trainable params: 60506624 (230.81 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


Now let's prepare a data collator which will handle dynamic padding

In [15]:
from transformers import DataCollatorForSeq2Seq

In [16]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    return_tensors="tf"
)

In [17]:
examples = [  ]

for example in tokenized_datasets["train"]:
  if len(example["input_ids"]) < 1024 or len(example["labels"]) < 128:
    examples.append(example)
  if len(examples) == 3:
    break

for example in examples:
  text_length = len(example["input_ids"])
  summary_length = len(example["labels"])
  print(f">>> Text tokens length: {text_length}")
  print(f">>> Summary tokens length: {summary_length}\n")

>>> Text tokens length: 1024
>>> Summary tokens length: 52

>>> Text tokens length: 1024
>>> Summary tokens length: 120

>>> Text tokens length: 1024
>>> Summary tokens length: 72



In [18]:
batch = data_collator(examples)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [19]:
batch["input_ids"].shape # each text will have length of largest text

TensorShape([3, 1024])

In [20]:
batch["labels"].shape # each summary will have length of largest summary

TensorShape([3, 120])

In [21]:
batch["labels"][1]

"""
for the labels, the padding used is -100,
this ensures that the loss for these slots are very low,
because e^-100 is a negligibly small number ~ 9x10^-44,
this ensures that theses slots do not contribute to the loss
"""

'\nfor the labels, the padding used is -100,\nthis ensures that the loss for these slots are very low,\nbecause e^-100 is a negligibly small number ~ 9x10^-44,\nthis ensures that theses slots do not contribute to the loss\n'

Let's prepare our datasets

In [22]:
batch_size=4

tf_train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batch_size
)

tf_eval_dataset = model.prepare_tf_dataset(
    tokenized_datasets["test"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=batch_size
)

Let's train our model

In [33]:
from transformers import AdamWeightDecay

optimizer = AdamWeightDecay(
    learning_rate=2e-5,
    weight_decay_rate=0.01
)

In [34]:
model.compile(optimizer=optimizer)

In [36]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [37]:
from transformers.keras_callbacks import PushToHubCallback

push_to_hub_callback = PushToHubCallback(
    output_dir="summarization-t5-small-finetuned-billsum",
    tokenizer=tokenizer,
)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/raj-p/summarization-t5-small-finetuned-billsum into local empty directory.


In [38]:
callbacks = [
    push_to_hub_callback
]

In [39]:
model.evaluate(tf_eval_dataset)



4.810105800628662

In [40]:
num_epochs = 3

model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    epochs=num_epochs,
    callbacks=callbacks,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7c1bc278cf40>

We should really use the ROUGE metric to score our summaries.

In [43]:
from transformers import pipeline

hub_model_id = "raj-p/summarization-t5-small-finetuned-billsum"
summarizer = pipeline("summarization", model=hub_model_id)

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/374M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at raj-p/summarization-t5-small-finetuned-billsum.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [45]:
article = billsum["test"].shuffle().select([0])

In [46]:
article["text"]

['The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 739.1 of the Public Utilities Code is amended to read:\n739.1.\n(a) The commission shall continue a program of assistance to low-income electric and gas customers with annual household incomes that are no greater than 200 percent of the federal poverty guideline levels, the cost of which shall not be borne solely by any single class of customer. For one-person households, program eligibility shall be based on two-person household guideline levels. The program shall be referred to as the California Alternate Rates for Energy or CARE program. The commission shall ensure that the level of discount for low-income electric and gas customers correctly reflects the level of need.\n(b) The commission shall establish rates for CARE program participants, subject to both of the following:\n(1) That the commission ensure that low-income ratepayers are not jeopardized or overburdened by monthly energy expenditures

In [47]:
summarizer(article["text"])

Token indices sequence length is longer than the specified maximum sequence length for this model (2137 > 512). Running this sequence through the model will result in indexing errors


[{'summary_text': 'Existing law requires the commission to continue a program of assistance to low-income electric and gas customers with annual household incomes that are no greater than 200 percent of the federal poverty guideline levels, the cost of which shall not be borne solely by any single class of customer. This bill would establish rates for CARE program participants, subject to both of the following: (1) The average effective CARE discount shall be calculated as a weighted average of the CARE discounts provided to individual customers, as determined by the needs assessment conducted pursuant'}]