## Environment Setup

In [None]:
# Checking the GPU availability
!nvidia-smi

Sat Apr 20 05:58:24 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

We are using T4 GPU (Tesla 4 GPU)

In [None]:
# Installing some packages
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [None]:
# Uninstalling and installing some packages

!pip uninstall -y transformers accelerate
!pip install transformers accelerate

Successfully installed accelerate-0.29.3 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.19.3 nvidia-nvjitlink-cu12-12.4.127 nvidia-nvtx-cu12-12.1.105
Found existing installation: transformers 4.38.2
Uninstalling transformers-4.38.2:
  Successfully uninstalled transformers-4.38.2
Found existing installation: accelerate 0.29.3
Uninstalling accelerate-0.29.3:
  Successfully uninstalled accelerate-0.29.3
Collecting transformers
  Downloading transformers-4.40.0-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Using cached accelerate-0.29.3-py3-none-any.whl (297 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Dow

## Model Fine Tuning

### Imports and Setup

In [1]:
import pandas as pd
import torch

from datasets import load_dataset, load_from_disk, load_metric

from transformers import (AutoModelForSeq2SeqLM,
                          AutoTokenizer,
                          DataCollatorForSeq2Seq,
                          pipeline,
                          TrainingArguments,
                          Trainer)

from tqdm import tqdm

# Custom utility module
from utility.utils import *




In [None]:
# Setting device to GPU if available else CPU (with PyTorch)
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [None]:
model_ckpt = "google/pegasus-cnn_dailymail"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

### Data Import and Analysis

In [None]:
# Download and unzip the dataset for fine tuning
!wget https://github.com/sg13041995/Datasets/raw/main/textSummarizer_samsun.zip
!unzip textSummarizer_samsun.zip

--2024-04-20 06:00:22--  https://github.com/sg13041995/Datasets/raw/main/textSummarizer_samsun.zip
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/sg13041995/Datasets/main/textSummarizer_samsun.zip [following]
--2024-04-20 06:00:23--  https://raw.githubusercontent.com/sg13041995/Datasets/main/textSummarizer_samsun.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7903594 (7.5M) [application/zip]
Saving to: ‘textSummarizer_samsun.zip’


2024-04-20 06:00:23 (130 MB/s) - ‘textSummarizer_samsun.zip’ saved [7903594/7903594]

Archive:  textSummarizer_samsun.zip
  inflating: samsum-test.csv         


In [None]:
# Loading the dataset from disk and explore
dataset_samsum = load_from_disk('samsum_dataset')
dataset_samsum

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

### Data Preprocessing and Training Setup

In [None]:
# Applying the convert_examples_to_features function on the dataset
dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched = True)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]



Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [None]:
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

In [None]:
# Defining the training configuration
trainer_args = TrainingArguments(
    output_dir='pegasus-samsum',
    num_train_epochs=4,
    warmup_steps=200,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=50,
    save_steps=1e6,
    gradient_accumulation_steps=16
)

In [None]:
# Initializing the trainer object
trainer = Trainer(model=model_pegasus,
                  args=trainer_args,
                  tokenizer=tokenizer,
                  data_collator=seq2seq_data_collator,
                  train_dataset=dataset_samsum_pt["test"],
                  eval_dataset=dataset_samsum_pt["validation"])

### Model Training

In [None]:
# Train the model with 4 epochs based on the previous observation of overfitting after epoch 4
trainer.train()

Step,Training Loss,Validation Loss
50,2.6226,2.072233
100,1.9911,1.747815
150,1.8527,1.630636
200,1.6269,1.580894


TrainOutput(global_step=204, training_loss=2.138618700644549, metrics={'train_runtime': 753.9962, 'train_samples_per_second': 4.345, 'train_steps_per_second': 0.271, 'total_flos': 1252252679675904.0, 'train_loss': 2.138618700644549, 'epoch': 3.9853479853479854})

- We observed overfitting after 4 epochs so we tried to stop the training at epoch 4
- But the model is not really working well with a training till 4 epochs

### Model Export

In [None]:
# Save model

# model_pegasus.save_pretrained("pegasus-samsum-model")

Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


In [None]:
# Save tokenizer

# tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/spiece.model',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

### Model Evaluation on Test Data

In [None]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_metric = load_metric('rouge')

  rouge_metric = load_metric('rouge')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [None]:
score = calculate_metric_on_test_ds(
    dataset_samsum['test'],
    rouge_metric,
    trainer.model,
    tokenizer,
    batch_size = 2,
    column_text = 'dialogue',
    column_summary= 'summary'
)

rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

pd.DataFrame(rouge_dict, index = [f'pegasus'] )

100%|██████████| 410/410 [13:00<00:00,  1.90s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.018343,0.000285,0.018281,0.018289


### Prediction

In [None]:
# Looking at specific example from the test dataset

test_example_number = 0
sample_text = dataset_samsum["test"][test_example_number]["dialogue"]
reference = dataset_samsum["test"][test_example_number]["summary"]

print("Input Dialogue:\n\n", sample_text)

print()

print("Summary:\n", reference)

Input Dialogue:

 Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Summary:
 Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


In [None]:
# Decided the multipliers based on the above observed ratios
min_length_multiplier = 0.10
max_length_multiplier = 0.25

In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("/content/tokenizer")

In [None]:
# Summarization parameter settings

test_example_number = 0
sample_text = dataset_samsum["test"][test_example_number]["dialogue"]
reference = dataset_samsum["test"][test_example_number]["summary"]

min_length = int(len(sample_text)*min_length_multiplier)
max_length = int(len(sample_text)*max_length_multiplier)

gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "min_length": min_length, "max_length": max_length}

In [None]:
pipe = pipeline("summarization", model="pegasus-samsum-model", tokenizer=tokenizer)

print("Dialogue:")
print(sample_text)

print("\nReference Summary:")
print(reference)

print("\nModel Summary:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])

Dialogue:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Reference Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.

Model Summary:
Betty's number is Larry's. He called her last time they were at the park together. He's very nice. Hannah would rather she text him instead of finding Betty's number.


The summary looks distorted. Not really good.