In [1]:
%%capture
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 datasets evaluate rouge_score

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [3]:
# Hugging Face Hub ID of the checkpoint that gets fine-tuned.
base_model = "UBC-NLP/AraT5v2-base-1024"

In [4]:
# Fetch the summarization corpus from the Hugging Face Hub.
ds = load_dataset(path="FahdSeddik/AGS-Corpus")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/4.68k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/190M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/141467 [00:00<?, ? examples/s]

In [5]:
# Hold out 20% of the train split for evaluation. The fixed seed makes the
# split reproducible across kernel restarts.
# NOTE: this cell rebinds `ds`; re-running it without reloading the dataset
# would split the already-reduced train portion again.
ds = ds["train"].train_test_split(test_size=0.2, seed=42)

In [6]:
# 4-bit NF4 quantization settings used when loading the base model;
# computation is carried out in float16.
compute_dtype = torch.float16

quant_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    load_in_4bit=True,
)

In [7]:
# Load the base checkpoint with the quantization settings defined in quant_config.
model = AutoModelForSeq2SeqLM.from_pretrained(
    base_model,
    device_map={"": 0},  # presumably pins every module to device 0 — confirm
    quantization_config=quant_config,
)
# NOTE(review): pretraining_tp looks carried over from Llama fine-tuning
# recipes; confirm whether this architecture reads it at all.
model.config.pretraining_tp = 1
# Turn off the decoding cache on the model config for training.
model.config.use_cache = False



config.json:   0%|          | 0.00/699 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

  return torch.load(checkpoint_file, map_location="cpu")


generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [8]:


# Load the tokenizer from the same checkpoint as the model; reusing `base_model`
# keeps the two from drifting apart if the checkpoint is ever changed.
tokenizer = AutoTokenizer.from_pretrained(base_model)


tokenizer_config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.40M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [9]:
prefix = "summarize: "


def preprocess_function(examples, max_input_length=1024, max_target_length=128):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: Batch mapping with a "text" list (source documents) and a
            "summary" list (reference summaries).
        max_input_length: Token limit applied to the prefixed documents;
            longer inputs are truncated.
        max_target_length: Token limit applied to the summaries; longer
            summaries are truncated.

    Returns:
        The tokenizer output for the prefixed documents, with the token IDs
        of the summaries stored under "labels".
    """
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Summaries are passed as text_target so they are tokenized as targets.
    labels = tokenizer(text_target=examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [10]:
# Tokenize every split, handing examples to preprocess_function in batches.
tokenized_ds = ds.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/113173 [00:00<?, ? examples/s]

Map:   0%|          | 0/28294 [00:00<?, ? examples/s]

In [11]:
# Pad dynamically: each batch is padded to the length of its longest sample at
# collation time, instead of padding the whole dataset to the maximum length.

from transformers import DataCollatorForSeq2Seq

# Pass the loaded model object, not its Hub name: the collator uses `model` to
# derive decoder_input_ids from the labels, which a plain string cannot provide.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [12]:
# LoRA adapter hyperparameters for the sequence-to-sequence model.
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
)

In [13]:
from peft import get_peft_model


In [14]:
# Attach the LoRA adapters to the base model. The guard keeps this cell
# idempotent: re-running it must not wrap an already-wrapped model again.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, lora_config)


In [15]:
import evaluate

# ROUGE metric object used by compute_metrics.
rouge = evaluate.load(path="rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [16]:
import numpy as np

def compute_metrics(eval_pred):
    """Score generated summaries against the reference summaries with ROUGE.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-ID arrays. When
            ``predictions`` is itself a tuple, its first element is used.

    Returns:
        Dict with the ROUGE scores plus ``gen_len`` (mean number of
        non-padding tokens per prediction), all rounded to 4 decimals.
    """
    predictions, labels = eval_pred

    # Keep only the token IDs if extra outputs were returned alongside them.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 marks positions ignored by the loss (and padding added when batches
    # are gathered). It is not a valid token ID, so swap it for the pad token
    # before decoding both predictions and labels.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # The summaries are Arabic. ROUGE's default tokenizer keeps only [a-z0-9]
    # and its stemmer is English-only, which would strip every Arabic token and
    # score ~0. Split on whitespace instead.
    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        tokenizer=str.split,
    )

    # Length of each prediction = number of non-padding tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    # Round for readability.
    return {k: round(v, 4) for k, v in result.items()}


In [17]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer


# Hyperparameters for one epoch of fine-tuning.
training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_sum_model",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    save_steps=0,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    # compute_metrics decodes token IDs, so evaluation/prediction must return
    # generated sequences rather than logits.
    predict_with_generate=True,
    # Same limit as the 128-token cap applied to the labels in preprocessing.
    generation_max_length=128,
    # NOTE: no evaluation strategy is set here, so these generation settings
    # only take effect when an evaluation or prediction is run explicitly.
)

In [18]:
# Wire the model, tokenized splits, collator and metric function into the trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    compute_metrics=compute_metrics,
)


In [19]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,29.5963
50,29.0506
75,29.6913
100,28.7425
125,28.795
150,26.1112
175,25.0063
200,20.9337
225,18.0644
250,15.2206


TrainOutput(global_step=28294, training_loss=2.7177119320792396, metrics={'train_runtime': 8128.1204, 'train_samples_per_second': 13.924, 'train_steps_per_second': 3.481, 'total_flos': 3.439127530308096e+16, 'train_loss': 2.7177119320792396, 'epoch': 1.0})

In [21]:
# Show every ROUGE evaluation recorded in the trainer's log history.
rouge_logs = [log for log in trainer.state.log_history if "eval_rougeL" in log]  # any other ROUGE key works too

for log in rouge_logs:
    print(f"ROUGE score from previous evaluation: {log}")

if not rouge_logs:
    # Say so explicitly instead of finishing silently when nothing was logged.
    print("No ROUGE evaluation found in the log history (no evaluation has been run).")


In [26]:
# Write the fine-tuned model and its tokenizer into the same directory.
save_dir = "./path_to_save_model"  # Replace with your desired save path
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)


('./path_to_save_model/tokenizer_config.json',
 './path_to_save_model/special_tokens_map.json',
 './path_to_save_model/spiece.model',
 './path_to_save_model/added_tokens.json',
 './path_to_save_model/tokenizer.json')

In [27]:
import shutil

# Compress the saved model folder into trained_model.zip; make_archive returns
# the path of the archive it wrote.
archive_path = shutil.make_archive("trained_model", "zip", "./path_to_save_model")

# Browser download is only available in Colab; elsewhere just report the path.
try:
    from google.colab import files
except ImportError:
    print(f"Archive written to {archive_path}")
else:
    files.download(archive_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>