In [None]:
!pip install transformers datasets evaluate rouge_score
!pip install accelerate -U

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizer

In [None]:
from datasets import load_dataset

# Load the training split of CNN/DailyMail (config "3.0.0").
# NOTE: the variable is called `billsum`, but the data loaded here is the
# cnn_dailymail corpus, not the BillSum dataset.
billsum = load_dataset("cnn_dailymail", "3.0.0", split="train")

Downloading builder script:   0%|          | 0.00/8.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/9.88k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
# Hold out 10% of the training split for evaluation. The seed is fixed so the
# split (and therefore the reported metrics) can be reproduced on a re-run;
# it matches the seed used for the later shuffles.
# NOTE: this rebinds `billsum` from a Dataset to a DatasetDict with "train"
# and "test" keys, so the cell is meant to be run once per kernel session.
billsum = billsum.train_test_split(test_size=0.1, seed=42)

In [None]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint to fine-tune; the same name is used for
# the tokenizer here and for the model weights further down.
checkpoint = "t5-small"
# Alternative checkpoint (disabled):
# checkpoint = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [None]:
# Task prefix prepended to every article before tokenization.
prefix = "summarize: "


def preprocess_function(examples):
    """
    Turn a batch of raw examples into tokenized model inputs with labels.

    Args:
        examples (dict): Batch with an "article" sequence (source texts) and a
            "highlights" sequence (reference summaries).

    Returns:
        The tokenizer output for the prefixed articles, with an extra "labels"
        entry holding the token ids of the tokenized highlights.

    Articles are truncated to 1024 tokens and highlights to 128 tokens. No
    padding is requested here.
    """
    # Build the prefixed source texts one by one.
    sources = []
    for article in examples["article"]:
        sources.append(prefix + article)

    encoded = tokenizer(sources, truncation=True, max_length=1024)

    # Summaries are tokenized as targets; only their ids are kept, as labels.
    targets = tokenizer(text_target=examples["highlights"], truncation=True, max_length=128)
    encoded["labels"] = targets["input_ids"]

    return encoded

In [None]:
# Run preprocess_function over the train and test splits in batches. The
# resulting datasets keep the original columns and gain input_ids,
# attention_mask and labels.
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/10341 [00:00<?, ? examples/s]

Map:   0%|          | 0/1149 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that assembles batches for the trainer. preprocess_function does
# not pad, so batching of variable-length examples relies on this collator.
# NOTE(review): `model` receives the checkpoint *name* (a str) here, not a
# loaded model instance — confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [None]:
import evaluate

# ROUGE metric object; compute_metrics calls rouge.compute(...) on decoded text.
rouge = evaluate.load("rouge")


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """
    Compute ROUGE scores and the mean generated length for an evaluation pass.

    Args:
        eval_pred (tuple): ``(predictions, labels)``. ``predictions`` holds the
            generated token ids (or a tuple whose first element holds them);
            ``labels`` holds the reference token ids. Either may contain -100
            as a padding / ignore marker.

    Returns:
        dict: The ROUGE results plus ``"gen_len"`` (mean number of non-pad
        tokens per prediction), every value rounded to 4 decimals.
    """
    predictions, labels = eval_pred

    # Some models return extra outputs alongside the generated ids; the ids
    # come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 is not a valid token id and cannot be decoded, so replace it with
    # the pad token in BOTH predictions and labels before decoding.
    predictions = np.asarray(predictions)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode predictions and labels to text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores using decoded predictions and labels
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Average generation length, counted after the -100 replacement so that
    # padding positions are never counted as generated tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained seq2seq model for the same checkpoint name that the
# tokenizer was loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Fixed-seed shuffle followed by a head-select: 100,000 rows for training and
# 10,000 rows for evaluation.
tokenized_billsum_train = (
    tokenized_billsum["train"]
    .shuffle(seed=42)
    .select(range(100000))
)
tokenized_billsum_test = (
    tokenized_billsum["test"]
    .shuffle(seed=42)
    .select(range(10000))
)
# Display the training subset as the cell output.
tokenized_billsum_train

Dataset({
    features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 100000
})

In [None]:
# Fine-tuning configuration, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_billsum_model",
    # optimisation
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    # evaluation: once per epoch, scoring generated text with compute_metrics
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # checkpointing
    save_total_limit=3,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_billsum_train,
    eval_dataset=tokenized_billsum_test,
    compute_metrics=compute_metrics,
)

# Left as the last expression so the returned TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.8697,1.678559,0.2396,0.111,0.199,0.199,18.9779
2,1.8567,1.673672,0.2392,0.1103,0.1982,0.1983,18.9805


TrainOutput(global_step=12500, training_loss=1.8739453515625, metrics={'train_runtime': 2939.0943, 'train_samples_per_second': 68.048, 'train_steps_per_second': 4.253, 'total_flos': 5.41367205888e+16, 'train_loss': 1.8739453515625, 'epoch': 2.0})

In [None]:
from google.colab import drive
# Mount Google Drive so the fine-tuned model can be persisted across sessions.
drive.mount('/content/gdrive')
# Absolute path under the mount point above, so saving does not depend on the
# notebook's current working directory.
PATH = "/content/gdrive/MyDrive/Saved_Models/"

Mounted at /content/gdrive


In [None]:
# Persist the fine-tuned model to the Drive directory in PATH. A later cell
# reloads both the tokenizer and the model from this same location.
trainer.save_model(PATH)

In [None]:
# Repeating everything to run for the saved model

!pip install transformers datasets evaluate rouge_score
!pip install accelerate -U

from datasets import load_dataset

# billsum = load_dataset("billsum", split="ca_test")
billsum = load_dataset("cnn_dailymail", "3.0.0", split="train")

billsum = billsum.train_test_split(test_size=0.1)

billsum2 = load_dataset("cnn_dailymail", "3.0.0", split="test")

billsum2 = billsum2.train_test_split(test_size=0.9999)


from transformers import AutoTokenizer
from google.colab import drive
# Mount Google Drive and point at the directory the fine-tuned model was
# saved to.
drive.mount('/content/gdrive')
# Absolute path under the mount point above, so loading does not depend on the
# notebook's current working directory.
PATH = "/content/gdrive/MyDrive/Saved_Models/"

# Continue from the saved fine-tuned model rather than a hub checkpoint.
# (To start from a base model instead, set `checkpoint` to a hub name such as
# "t5-small" or "sshleifer/distilbart-cnn-12-6".)
checkpoint = PATH

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Task prefix prepended to every article before tokenization.
prefix = "summarize: "
def preprocess_function(examples):
    """Tokenize a batch: prefixed articles as inputs, highlights as labels.

    Articles are truncated to 1024 tokens and highlights to 128 tokens; the
    highlight token ids are attached to the returned encoding under "labels".
    """
    # Build the prefixed source texts one by one.
    sources = []
    for article in examples["article"]:
        sources.append(prefix + article)

    encoded = tokenizer(sources, truncation=True, max_length=1024)

    # Summaries are tokenized as targets; only their ids are kept, as labels.
    targets = tokenizer(text_target=examples["highlights"], truncation=True, max_length=128)
    encoded["labels"] = targets["input_ids"]

    return encoded

# Tokenize both dataset dictionaries in batches: the train/held-out split of
# the training data, and the one built from the official test split.
tokenized_billsum = billsum.map(preprocess_function, batched=True)
tokenized_billsum2 = billsum2.map(preprocess_function, batched=True)


from transformers import DataCollatorForSeq2Seq

# Collator that assembles batches for the trainer. preprocess_function does
# not pad, so batching of variable-length examples relies on this collator.
# NOTE(review): `model` receives `checkpoint` (a path string here), not a
# loaded model instance — confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

import evaluate

# ROUGE metric object; compute_metrics calls rouge.compute(...) on decoded text.
rouge = evaluate.load("rouge")

import numpy as np


def compute_metrics(eval_pred):
    """
    Compute ROUGE scores and the mean generated length for an evaluation pass.

    Args:
        eval_pred (tuple): ``(predictions, labels)``. ``predictions`` holds the
            generated token ids (or a tuple whose first element holds them);
            ``labels`` holds the reference token ids. Either may contain -100
            as a padding / ignore marker.

    Returns:
        dict: The ROUGE results plus ``"gen_len"`` (mean number of non-pad
        tokens per prediction), every value rounded to 4 decimals.
    """
    predictions, labels = eval_pred

    # Some models return extra outputs alongside the generated ids; the ids
    # come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 is not a valid token id and cannot be decoded, so replace it with
    # the pad token in BOTH predictions and labels before decoding.
    predictions = np.asarray(predictions)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Counted after the -100 replacement so padding never counts as generated.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer


# `checkpoint` is the Drive directory (PATH) in this cell, so this loads the
# model from that saved location rather than from a hub checkpoint name.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)



Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizer

Downloading builder script:   0%|          | 0.00/8.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/9.88k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Mounted at /content/gdrive


Map:   0%|          | 0/258401 [00:00<?, ? examples/s]

Map:   0%|          | 0/28712 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/11489 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# Training subset: 250,000 rows from the train part of the training data.
# Evaluation subset: 11,000 rows from the official test split.
# The shuffles are seeded (same seed as the first run's subsets) so that the
# selected rows, and therefore the reported metrics, can be reproduced.
tokenized_billsum_train = tokenized_billsum['train'].shuffle(seed=42).select(range(250000))
tokenized_billsum_test = tokenized_billsum2['test'].shuffle(seed=42).select(range(11000))

# One further epoch on top of the previously saved model; evaluation runs at
# the end of the epoch and uses generation so ROUGE can be computed.
training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_billsum_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum_train,
    eval_dataset=tokenized_billsum_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the additional training epoch configured above; left as the last
# expression so the returned TrainOutput is displayed.
trainer.train()



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.8278,1.68042,0.2473,0.118,0.2046,0.2046,18.9985


TrainOutput(global_step=15625, training_loss=1.8378835439453125, metrics={'train_runtime': 5007.848, 'train_samples_per_second': 49.922, 'train_steps_per_second': 3.12, 'total_flos': 6.76697418718249e+16, 'train_loss': 1.8378835439453125, 'epoch': 1.0})