## Installations

In [1]:
!pip -q install transformers datasets evaluate transformers[torch]
! pip install -U accelerate
!pip install bitsandbytes
!pip install rouge_score
!pip install huggingface-hub

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [48]:
# Standard library
import os
# PyTorch plus the Hugging Face dataset / model / training APIs
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import TrainingArguments, Trainer

# PEFT: LoRA configuration and adapter-wrapping helpers
from peft import LoraConfig, get_peft_model, TaskType
from peft import PeftModel, PeftConfig

import warnings
# Suppress every Python warning for the rest of the session. Note that this
# also hides deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

## Load Dataset

In [49]:
from datasets import load_dataset

# Fetch only the "train" split of BillSum; an evaluation subset is split off
# from it in a later cell.
billsum = load_dataset(path="billsum", split="train")

In [50]:
# Display the dataset summary (column names and number of rows).
billsum

Dataset({
    features: ['text', 'summary', 'title'],
    num_rows: 18949
})

In [51]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the shuffle,
# and therefore the train/test membership, identical on every run so that
# metrics are comparable across kernel restarts.
billsum = billsum.train_test_split(test_size=0.2, seed=42)

In [52]:
# Peek at the first training example (a dict with 'text', 'summary', 'title').
billsum["train"][0]

{'text': "SECTION 1. SHORT TITLE.\n\n    This Act may be cited as the ``Presidential Threat Protection Act \nof 2000''.\n\nSEC. 2. REVISION OF SECTION 879 OF TITLE 18, UNITED STATES CODE.\n\n    (a) In General.--Section 879 of title 18, United States Code, is \namended--\n        (1) by striking ``or'' at the end of subsection (a)(2);\n        (2) in subsection (a)(3)--\n            (A) by striking ``the spouse'' and inserting ``a member of \n        the immediate family''; and\n            (B) by inserting ``or'' after the semicolon at the end;\n        (3) by inserting after subsection (a)(3) the following:\n        ``(4) a person protected by the Secret Service under section \n    3056(a)(6);'';\n        (4) in subsection (a)--\n            (A) by striking ``who is protected by the Secret Service as \n        provided by law,''; and\n            (B) by striking ``three years'' and inserting ``5 years''; \n        and\n        (5) in subsection (b)(1)(B)--\n            (A) by inserti

## Tokenization and preprocessing

In [None]:
# Load the pretrained T5-base checkpoint: the seq2seq model weights and the
# tokenizer that belongs to the same checkpoint.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")

In [12]:
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of bills and their reference summaries.

    Args:
        examples: batch mapping with a ``"text"`` list (documents) and a
            ``"summary"`` list (target summaries).

    Returns:
        The tokenizer output for the prefixed documents (truncated to 1024
        tokens) with an extra ``"labels"`` entry holding the ``input_ids`` of
        the summaries (truncated to 128 tokens).
    """
    # Every document is prepended with the task prefix before tokenization.
    model_inputs = tokenizer(
        [prefix + document for document in examples["text"]],
        max_length=1024,
        truncation=True,
    )

    # Summaries go through `text_target` so they are tokenized as targets.
    target_encoding = tokenizer(
        text_target=examples["summary"],
        max_length=128,
        truncation=True,
    )
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

In [13]:
# Tokenize both splits; batched mode hands preprocess_function lists of rows.
tokenized_billsum = billsum.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/15159 [00:00<?, ? examples/s]

Map:   0%|          | 0/3790 [00:00<?, ? examples/s]

In [41]:
# Display the tokenized splits (original columns plus input_ids, attention_mask, labels).
tokenized_billsum

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 15159
    })
    test: Dataset({
        features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3790
    })
})

In [15]:
from transformers import DataCollatorForSeq2Seq

# Collator that pads inputs and labels batch by batch. Its `model` argument
# takes the loaded model object; `checkpoint` (used here before) is never
# defined in this notebook and raised NameError on a fresh kernel.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [38]:
import evaluate

# ROUGE scorer consumed by compute_metrics during evaluation.
rouge = evaluate.load(path="rouge")

In [46]:
import numpy as np


def compute_metrics(eval_pred):
    """Decode generated and reference summaries and score them with ROUGE.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated ids.

    Returns:
        dict mapping each ROUGE metric name, plus ``"gen_len"`` (mean number
        of non-padding tokens per prediction), to a value rounded to 4
        decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Keep only the generated token ids when extra outputs are attached.
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    pad_id = tokenizer.pad_token_id
    # -100 is a padding/ignore marker, not a vocabulary id. It must be swapped
    # for the pad token in BOTH arrays before decoding; leaving it in the
    # predictions breaks decoding and inflates the generated-length statistic.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [20]:
# Token masked due to sensitive info
!huggingface-cli login --token xyz



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `podcast` has been saved to /home/khairnar.as/.cache/huggingface/stored_tokens
Your token has been saved to /home/khairnar.as/.cache/huggingface/token
Login successful.
The current active token is: `podcast`


## Model Training

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from peft import get_peft_model, LoraConfig, TaskType

# LoRA configuration
lora_config = LoraConfig(
    r=64,                       # rank of the low-rank update matrices
    lora_alpha=64,              # LoRA scaling factor (equal to r here)
    lora_dropout=0.01,          # dropout applied inside the LoRA layers
    bias="all",                 # bias parameters are trained alongside the adapters
    task_type=TaskType.SEQ_2_SEQ_LM,
    # T5 names its attention projections "q", "k", "v", "o". The names
    # "q_proj"/"v_proj" belong to other architectures and match no module in
    # T5, which makes get_peft_model fail with "target modules not found".
    target_modules=["q", "v"],
)


# Wrap the base model so that only the LoRA parameters selected above train.
lora_model = get_peft_model(model, lora_config)

training_args = Seq2SeqTrainingArguments(
    # --- output location / Hub upload ---
    output_dir="lora_t5_medium",
    push_to_hub=True,
    # --- run length ---
    num_train_epochs=3,
    max_steps=-1,
    # --- batching ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    auto_find_batch_size=True,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # --- optimisation ---
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.01,
    max_grad_norm=0.3,
    # --- numeric precision ---
    bf16=True,
    fp16=False,
    # --- evaluation, logging, checkpoints ---
    eval_strategy="epoch",
    predict_with_generate=True,
    logging_steps=25,
    save_steps=0,
    save_total_limit=3,
)


# Wire the LoRA-wrapped model, tokenized splits, collator and metric function
# into the trainer. The tokenizer is supplied once, through `processing_class`;
# passing the deprecated `tokenizer=` alias as well makes the constructor
# reject the call.
trainer = Seq2SeqTrainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; evaluation happens at the end of each epoch (eval_strategy="epoch").
trainer.train()

In [23]:
# Print the trainable vs. total parameter counts of the LoRA-wrapped model.
lora_model.print_trainable_parameters()

trainable params: 7,078,656 || all params: 229,981,440 || trainable%: 3.0779
