In [8]:
!pip install datasets transformers rouge_score



In [9]:
from datasets import load_dataset

billsum = load_dataset("billsum", split="ca_test")

Found cached dataset billsum (C:/Users/Lenovo/.cache/huggingface/datasets/billsum/default/3.0.0/75cf1719d38d6553aa0e0714c393c74579b083ae6e164b2543684e3e92e0c4cc)


In [10]:
billsum = billsum.train_test_split(test_size=0.2)

In [11]:
billsum["train"][0]


{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nArticle 2 (commencing with Section 18706) is added to Chapter 3 of Part 10.2 of Division 2 of the Revenue and Taxation Code, to read:\nArticle  2. Special Olympics Fund\n18706.\n(a) Any individual may designate on the tax return that a contribution in excess of the tax liability, if any, be made to the Special Olympics Fund established by Section 18707 to be used by the Special Olympics Northern California and the Special Olympics Southern California.\n(b) The contribution shall be in full dollar amounts and may be made individually by each signatory on the joint return.\n(c) A designation under subdivision (a) shall be made for any taxable year on the original return for that taxable year, and once made shall be irrevocable. If payments and credits reported on the return, together with any other credits associated with the individual’s account, do not exceed the individual’s tax liability, the return

In [13]:
from transformers import AutoTokenizer


In [14]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")

Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [16]:
prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [17]:
tokenized_text = billsum.map(preprocess_function, batched = True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [18]:

from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [20]:
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [22]:

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_text["train"],
    eval_dataset=tokenized_text["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, title, text. If summary, title, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 989
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 620
  Number of trainable parameters = 60506624
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, title, text. If summary, title, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 248
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, title, text. If summary, title, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 248
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: summary, title, text. If summary, title, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Ru

TrainOutput(global_step=620, training_loss=2.7110961421843496, metrics={'train_runtime': 394.5496, 'train_samples_per_second': 25.067, 'train_steps_per_second': 1.571, 'total_flos': 2677060833116160.0, 'train_loss': 2.7110961421843496, 'epoch': 10.0})

In [23]:
billsum["test"]


Dataset({
    features: ['text', 'summary', 'title'],
    num_rows: 248
})

In [24]:
import torch
def generate_answer(batch):
    inputs_dict = tokenizer(batch["text"], padding="max_length", max_length=8192, return_tensors="pt", truncation=True)
    input_ids = inputs_dict.input_ids.to("cuda")
    attention_mask = inputs_dict.attention_mask.to("cuda")
    global_attention_mask = torch.zeros_like(attention_mask)
    global_attention_mask[:, 0] = 1
    predicted_abstract_ids = model.generate(input_ids, attention_mask=attention_mask)
    batch["predicted_summary"] = tokenizer.batch_decode(predicted_abstract_ids, skip_special_tokens=True)
    return batch

result = billsum["test"].map(generate_answer, batched=True, batch_size=1)

  0%|          | 0/248 [00:00<?, ?ba/s]

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0.dev0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0.dev0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0.dev0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0.dev0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0.dev0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0.dev0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_

In [25]:
import pandas as pd
from datasets import load_metric
result_df = pd.DataFrame(result)
result_df.to_csv("result.csv")

# load rouge
rouge = load_metric("rouge")

print("Rouge1 Result:", rouge.compute(predictions=result["predicted_summary"], references=result["summary"], rouge_types=["rouge1"])["rouge1"].mid)
print("Rouge2 Result:", rouge.compute(predictions=result["predicted_summary"], references=result["summary"], rouge_types=["rouge2"])["rouge2"].mid)
print("RougeL Result:", rouge.compute(predictions=result["predicted_summary"], references=result["summary"], rouge_types=["rougeL"])["rougeL"].mid)



  rouge = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

Rouge1 Result: Score(precision=0.6637445828207211, recall=0.021610183190142303, fmeasure=0.0404904812324681)
Rouge2 Result: Score(precision=0.3277210077109273, recall=0.01025779487826682, fmeasure=0.019334454861029908)
RougeL Result: Score(precision=0.6240469953631125, recall=0.01967006531762252, fmeasure=0.03688502159604318)
