In [1]:
from datasets import load_dataset

# Only the "ca_test" split of BillSum is loaded here; it is divided into
# train/test portions in the next cell.
billsum = load_dataset(
    "billsum",
    split="ca_test",
)


In [2]:
# Hold out 20% of the examples for evaluation. A fixed seed makes the shuffle
# deterministic, so train/test membership (and therefore the reported metrics)
# is reproducible after a kernel restart.
billsum = billsum.train_test_split(test_size=0.2, seed=42)

In [3]:
# Display the first training record to inspect its fields (e.g. the bill 'text').
billsum["train"][0]

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nThe Legislature finds and declares all of the following:\n(a) California has the eighth largest economy in the world, and its laws have a far-reaching impact on individuals, entities, and organizations within the state and throughout the world.\n(b) Because of its extraordinary economic impact and leadership on timely issues, California’s statutory framework and legal structures have a national and global impact.\n(c) Rapid technological and societal advances require the development of public policy in new and evolving areas.\n(d) State government officials must make informed policy decisions about issues that have increasingly complex and interrelated legal components.\n(e) California is home to some of the world’s most prestigious universities and law schools.\n(f) California is currently facing one of the largest surpluses of recent law school graduates in the nation, and the unique education and t

In [4]:
from transformers import AutoTokenizer
# Hub id of the pretrained checkpoint; reused later when the model is loaded.
checkpoint = "google-t5/t5-small"
# Tokenizer matching the checkpoint; used for both the inputs and the target summaries.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
# Task prefix prepended to every document before tokenization.
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Every entry of ``examples["text"]`` is prepended with ``prefix`` and
    tokenized with truncation at 1024 tokens. ``examples["summary"]`` is
    tokenized as the target text with truncation at 128 tokens.

    Returns:
        The input encoding, with the target token ids stored under ``"labels"``.
    """
    prompts = []
    for document in examples["text"]:
        prompts.append(prefix + document)

    encoded = tokenizer(prompts, max_length=1024, truncation=True)
    target_encoding = tokenizer(
        text_target=examples["summary"],
        max_length=128,
        truncation=True,
    )
    encoded["labels"] = target_encoding["input_ids"]
    return encoded
    

In [6]:
# Tokenize both splits; with batched=True, preprocess_function receives a
# batch of examples per call rather than a single record.
tokenized_billsum = billsum.map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [7]:
from transformers import DataCollatorForSeq2Seq

# Collator that batches the tokenized examples for seq2seq training.
# Its optional `model` argument expects a model instance, not a checkpoint
# name string; the model has not been loaded at this point, so the argument
# is omitted instead of passing a value of the wrong type.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)


In [11]:
import evaluate
import numpy as np
# ROUGE metric object; compute_metrics calls rouge.compute on the decoded text.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated token ids.

    Returns:
        dict mapping every key returned by ``rouge.compute`` plus ``"gen_len"``
        (mean number of non-padding tokens per prediction) to a value rounded
        to 4 decimal places.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is used as a padding/ignore marker and is not a valid token id:
    # swap it for the pad token in BOTH arrays before decoding, otherwise the
    # tokenizer can fail on negative ids and gen_len would count the padding.
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )

    # Generated length = number of non-padding tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}




In [12]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Pretrained seq2seq model to fine-tune, loaded from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

training_args = Seq2SeqTrainingArguments(
    # Output location and Hub upload
    output_dir="f1_t5_billsum_model",
    push_to_hub=True,
    save_total_limit=3,
    # Schedule
    num_train_epochs=4,
    eval_strategy="epoch",
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # Mixed precision explicitly disabled
    fp16=False,
    bf16=False,
    # Generate sequences during evaluation so compute_metrics receives token ids
    predict_with_generate=True,
)


In [13]:
# Wire the model, tokenized splits, collator and metric function into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.0212,2.336903,0.2101,0.104,0.177,0.1769,20.0
2,2.4896,2.253265,0.2107,0.1066,0.1784,0.1787,20.0
3,2.4628,2.217875,0.2126,0.1101,0.1805,0.1806,20.0
4,2.409,2.208604,0.2109,0.1068,0.1779,0.1779,20.0


  [2m2025-08-23T13:23:30.422788Z[0m [33m WARN[0m  [33mStatus Code: 502. Retrying..., [1;33mrequest_id[0m[33m: ""[0m
    [2;3mat[0m /Users/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220





TrainOutput(global_step=3956, training_loss=2.5421505765075993, metrics={'train_runtime': 1166.2271, 'train_samples_per_second': 3.392, 'train_steps_per_second': 3.392, 'total_flos': 1070364382593024.0, 'train_loss': 2.5421505765075993, 'epoch': 4.0})

In [14]:
# Upload the trained model files to the Hugging Face Hub repository for this run.
trainer.push_to_hub()


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ..._t5_billsum_model/training_args.bin: 100%|##########| 5.91kB / 5.91kB            

  ..._t5_billsum_model/model.safetensors:   0%|          |  552kB /  242MB            

CommitInfo(commit_url='https://huggingface.co/prxshetty/f1_t5_billsum_model/commit/1f4e35c6ba89bc4c452147e1386732c2031030ff', commit_message='End of training', commit_description='', oid='1f4e35c6ba89bc4c452147e1386732c2031030ff', pr_url=None, repo_url=RepoUrl('https://huggingface.co/prxshetty/f1_t5_billsum_model', endpoint='https://huggingface.co', repo_type='model', repo_id='prxshetty/f1_t5_billsum_model'), pr_revision=None, pr_num=None)

In [18]:
# Sample passage to summarize; it carries the same "summarize: " task prefix used in preprocessing.
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."
from transformers import pipeline
# Load the fine-tuned checkpoint from the Hub into a summarization pipeline.
# NOTE(review): the pipeline may add its own task prefix from the model config —
# confirm the "summarize: " prefix is not applied twice to `text`.
summarizer = pipeline("summarization", model = "prxshetty/f1_t5_billsum_model")
summarizer(text)

Device set to use mps:0
Your max_length is set to 200, but your input_length is only 103. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)


[{'summary_text': "The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country."}]

In [20]:
from transformers import AutoTokenizer
# Rebinds `tokenizer` to the one stored with the fine-tuned model on the Hub.
tokenizer = AutoTokenizer.from_pretrained("prxshetty/f1_t5_billsum_model")
# Encode the sample text as PyTorch tensors and keep only the token ids for generate().
inputs = tokenizer(text, return_tensors="pt").input_ids

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Reload the fine-tuned weights from the Hub (rebinds `model`).
model = AutoModelForSeq2SeqLM.from_pretrained("prxshetty/f1_t5_billsum_model")

# Generate with sampling disabled, capped at 200 newly generated tokens.
outputs = model.generate(
    inputs,
    max_new_tokens=200,
    do_sample=False,
)

# Decode the first generated sequence back to text, dropping special tokens.
tokenizer.decode(outputs[0], skip_special_tokens=True)
