## Utility Functions

In [None]:
def convert_examples_to_features(example_batch):
    # Preprocessing the dialogue
    input_encodings = tokenizer(example_batch['dialogue'] , max_length = 1024, truncation = True )

    # Preprocessing the summary considering them as target
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['summary'], max_length = 128, truncation = True )

    # Returning the preprocessed data as dict which is a requirement
    return {
        'input_ids' : input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

In [None]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    
    """split the dataset into smaller batches that we can process simultaneously
    Yield successive batch-sized chunks from list_of_elements."""

    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]

In [None]:
def calculate_metric_on_test_ds(dataset,
                                metric,
                                model,
                                tokenizer,
                                batch_size=16,
                                device=device,
                                column_text="article",
                                column_summary="highlights"):

    # Generating batches of input and output
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    # Iterating through batches
    # Using tqdm for progress bar 
    for article_batch, target_batch in tqdm(
        # Zipping input and target
        zip(article_batches, target_batches), total=len(article_batches)):

        # Tokenizing a batch
        inputs = tokenizer(article_batch, max_length=1024,  truncation=True,
                        padding="max_length", return_tensors="pt")

        # Calculating the summary
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device),
                         length_penalty=0.8, num_beams=8, max_length=128)
        
        ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''

        # Finally, we decode the generated texts, replace the token, and add the decoded texts with the references to the metric.
        decoded_summaries = [tokenizer.decode(s,
                                              skip_special_tokens=True,
                                              clean_up_tokenization_spaces=True) for s in summaries]

        decoded_summaries = [d.replace("", " ") for d in decoded_summaries]

        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    # Finally compute and return the ROUGE scores.
    score = metric.compute()
    return score

## Environment Setup

In [1]:
# Checking the GPU availability

!nvidia-smi

Fri Apr 19 10:54:39 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
# Installing some packages

!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.6/67.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m

In [3]:
# Uninstalling and then reinstalling some packages

!pip uninstall -y transformers accelerate
!pip install transformers accelerate

Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/297.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/297.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m297.0/297.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  U

## Model Fine Tuning

### Imports and Setup

In [4]:
import pandas as pd

import torch

from datasets import (load_dataset, 
                      load_from_disk, 
                      load_metric)

from transformers import (AutoModelForSeq2SeqLM,
                          AutoTokenizer,
                          DataCollatorForSeq2Seq,
                          pipeline,
                          TrainingArguments,
                          Trainer)

from tqdm import tqdm

In [5]:
# Setting device to GPU if available else CPU (with PyTorch)

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [6]:
# Loading specific checkpoint of PEGASUS model
model_ckpt = "google/pegasus-cnn_dailymail"

# Loading the corresponding required tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

### Data Import and Analysis

In [7]:
# Download and unzip the dataset for fine tuning

!wget https://github.com/sg13041995/Datasets/raw/main/textSummarizer_samsun.zip
!unzip textSummarizer_samsun.zip

--2024-04-19 10:58:03--  https://github.com/sg13041995/Datasets/raw/main/textSummarizer_samsun.zip
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/sg13041995/Datasets/main/textSummarizer_samsun.zip [following]
--2024-04-19 10:58:03--  https://raw.githubusercontent.com/sg13041995/Datasets/main/textSummarizer_samsun.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7903594 (7.5M) [application/zip]
Saving to: ‘textSummarizer_samsun.zip’


2024-04-19 10:58:04 (133 MB/s) - ‘textSummarizer_samsun.zip’ saved [7903594/7903594]

Archive:  textSummarizer_samsun.zip
  inflating: samsum-test.csv      

In [8]:
# Loading the dataset from disk and explore

dataset_samsum = load_from_disk('samsum_dataset')
dataset_samsum

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [9]:
type(dataset_samsum)

datasets.dataset_dict.DatasetDict

In [10]:
dataset_samsum.keys()

dict_keys(['train', 'test', 'validation'])

In [11]:
dataset_samsum.values()

dict_values([Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 14732
}), Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 819
}), Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 818
})])

In [12]:
dataset_samsum["train"]

Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 14732
})

In [13]:
type(dataset_samsum["train"])

In [14]:
dataset_samsum['train'].column_names

['id', 'dialogue', 'summary']

In [15]:
dataset_samsum["train"][0]

{'id': '13818513',
 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)",
 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.'}

In [16]:
type(dataset_samsum["train"][0])

dict

In [17]:
dataset_samsum["train"][0]["id"]

'13818513'

In [18]:
print(dataset_samsum["train"][0]["dialogue"])

Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-)


In [19]:
print(dataset_samsum["train"][0]["summary"])

Amanda baked cookies and will bring Jerry some tomorrow.


### Data Preprocessing and Training Setup

In [None]:
# Testing the function on the first sample from train dataset

tokenized_example = convert_examples_to_features(dataset_samsum["train"][0])



In [None]:
print(tokenized_example["input_ids"])

[12195, 151, 125, 7091, 3659, 107, 842, 119, 245, 181, 152, 10508, 151, 7435, 147, 12195, 151, 125, 131, 267, 650, 119, 3469, 29344, 1]


In [None]:
print(tokenized_example["attention_mask"])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [None]:
print(tokenized_example["labels"])

[12195, 7091, 3659, 111, 138, 650, 10508, 181, 3469, 107, 1]


In [None]:
# Decoding the tokenized sample without including the special tokens

decoded_dialogue = tokenizer.decode(tokenized_example["input_ids"], skip_special_tokens=True)
decoded_summary = tokenizer.decode(tokenized_example["labels"], skip_special_tokens=True)

print(f"Decoded Dialogue: {decoded_dialogue}")
print(f"Decoded Summary: {decoded_summary}")

Decoded Dialogue: Amanda: I baked cookies. Do you want some? Jerry: Sure! Amanda: I'll bring you tomorrow :-)
Decoded Summary: Amanda baked cookies and will bring Jerry some tomorrow.


In [None]:
# Decoding the tokenized sample including the special tokens

decoded_dialogue = tokenizer.decode(tokenized_example["input_ids"], skip_special_tokens=False)
decoded_summary = tokenizer.decode(tokenized_example["labels"], skip_special_tokens=False)

print(f"Decoded Dialogue: {decoded_dialogue}")
print(f"Decoded Summary: {decoded_summary}")

Decoded Dialogue: Amanda: I baked cookies. Do you want some? Jerry: Sure! Amanda: I'll bring you tomorrow :-)</s>
Decoded Summary: Amanda baked cookies and will bring Jerry some tomorrow.</s>


In [None]:
# We can observe that the length of the attention mask is same as the number of input tokens
# There was no padding and so the attention mask is all 1s

print(len(tokenized_example["input_ids"]))
print(len(tokenized_example["labels"]))
print(len(tokenized_example["attention_mask"]))

25
11
25


In [21]:
# Applying the convert_examples_to_features function on the dataset

dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched = True)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]



Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [None]:
dataset_samsum_pt["train"]

Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 14732
})

In [22]:
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

In [23]:
# Defining the training configuration

trainer_args = TrainingArguments(
    output_dir='pegasus-samsum',
    num_train_epochs=10,
    warmup_steps=200,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=50,
    save_steps=1e6,
    gradient_accumulation_steps=16
)

In [24]:
# Initializing the trainer object

trainer = Trainer(model=model_pegasus,
                  args=trainer_args,
                  tokenizer=tokenizer,
                  data_collator=seq2seq_data_collator,
                  train_dataset=dataset_samsum_pt["test"],
                  eval_dataset=dataset_samsum_pt["validation"])

### Model Training

In [25]:
# Train the model
trainer.train()

Step,Training Loss,Validation Loss
50,2.6226,2.072233
100,1.9911,1.747815
150,1.8527,1.630636
200,1.6269,1.580894
250,1.508,1.579892
300,1.4038,1.551296
350,1.3201,1.551876
400,1.2851,1.550625
450,1.1797,1.550855
500,1.1701,1.553786


TrainOutput(global_step=510, training_loss=1.6379867834203383, metrics={'train_runtime': 1954.2908, 'train_samples_per_second': 4.191, 'train_steps_per_second': 0.261, 'total_flos': 3126866083700736.0, 'train_loss': 1.6379867834203383, 'epoch': 9.963369963369964})

It seems the model starts to overfit after epoch 4 or step 200

### Model Export

In [59]:
# Save model

# model_pegasus.save_pretrained("pegasus-samsum-model")

Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


In [56]:
# Save tokenizer

# tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/spiece.model',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

### Model Evaluation on Test Data

- column_text: This is an optional argument specifying the column name in the dataset containing the text to be summarized (default "article").

- column_summary: This is an optional argument specifying the column name in the dataset containing the reference summaries (default "highlights").

In [28]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_metric = load_metric('rouge')

  rouge_metric = load_metric('rouge')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

- "rouge1": Refers to ROUGE-1 which considers the overlap of unigrams (single words) between the generated summary and the reference summaries.

- "rouge2": Refers to ROUGE-2 which considers the overlap of bigrams (sequences of two words) between the generated summary and the reference summaries.

- "rougeL": Refers to ROUGE-L which considers the longest common subsequence (LCS) of words between the generated summary and the reference summaries.

In [29]:
score = calculate_metric_on_test_ds(
    dataset_samsum['test'],
    rouge_metric,
    trainer.model,
    tokenizer,
    batch_size = 2,
    column_text = 'dialogue',
    column_summary= 'summary'
)

rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

pd.DataFrame(rouge_dict, index = [f'pegasus'] )

100%|██████████| 410/410 [13:30<00:00,  1.98s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.018436,0.000313,0.018401,0.018446


### Prediction

In [35]:
# Looking at specific example from the test dataset

test_example_number = 0
sample_text = dataset_samsum["test"][test_example_number]["dialogue"]
reference = dataset_samsum["test"][test_example_number]["summary"]

print("Input Dialogue:\n\n", sample_text)

print()

print("Summary:\n", reference)

Input Dialogue:

 Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Summary:
 Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


In [32]:
# Looking at the length of input and target(summary) sequences for the specific example

print(len(sample_text))
print(len(reference))

459
54


In [38]:
# Looking at the length of input and target(summary) sequences and (summary/input) ratio for some examples

for i in range(10):
  test_example_number = i
  sample_text = dataset_samsum["test"][test_example_number]["dialogue"]
  reference = dataset_samsum["test"][test_example_number]["summary"]

  print("Input:", len(sample_text))
  print("Summary", len(reference))
  print("Ratio", len(reference)/len(sample_text))
  print("="*50)

Input: 407
Summary 83
Ratio 0.20393120393120392
Input: 459
Summary 54
Ratio 0.11764705882352941
Input: 592
Summary 150
Ratio 0.2533783783783784
Input: 461
Summary 50
Ratio 0.10845986984815618
Input: 1101
Summary 221
Ratio 0.20072661217075385
Input: 1559
Summary 300
Ratio 0.19243104554201412
Input: 1055
Summary 190
Ratio 0.18009478672985782
Input: 439
Summary 60
Ratio 0.1366742596810934
Input: 479
Summary 138
Ratio 0.2881002087682672
Input: 427
Summary 96
Ratio 0.22482435597189696


In [52]:
# Decided the multipliers based on the above observed ratios
min_length_multiplier = 0.10
max_length_multiplier = 0.25

In [53]:
# Checking the calculated min_length and max_length as per the multipliers

test_example_number = 0
sample_text = dataset_samsum["test"][test_example_number]["dialogue"]
reference = dataset_samsum["test"][test_example_number]["summary"]

print(len(sample_text)*min_length_multiplier)
print(len(sample_text)*max_length_multiplier)

print()

print(len(reference))

40.7
101.75

83


In [57]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("/content/tokenizer")

In [63]:
# Summarization parameter settings

test_example_number = 0
sample_text = dataset_samsum["test"][test_example_number]["dialogue"]
reference = dataset_samsum["test"][test_example_number]["summary"]

min_length = int(len(sample_text)*min_length_multiplier)
max_length = int(len(sample_text)*max_length_multiplier)

gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "min_length": min_length, "max_length": max_length}

In [64]:
# Performing the prediction

pipe = pipeline("summarization", model="pegasus-samsum-model", tokenizer=tokenizer)

print("Dialogue:")
print(sample_text)

print("\nReference Summary:")
print(reference)

print("\nModel Summary:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])

Dialogue:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Reference Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.

Model Summary:
Hannah is looking for Betty's number. Larry called her last time they were at the park together and she doesn't know him well. She wants Amanda to text him instead of calling him.
