In [2]:
!nvidia-smi

Sat Apr 27 20:16:16 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A40                     On  | 00000000:01:00.0 Off |                    0 |
|  0%   58C    P0              88W / 300W |    786MiB / 46068MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A40                     On  | 00000000:25:00.0 Off |  

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

from transformers import (
    pipeline,
    set_seed,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
  )

from datasets import (
    load_dataset,
    load_from_disk,
    load_metric
)

import nltk
from nltk.tokenize import sent_tokenize

cuda


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Get the pegasus model
pegasus = 'google/pegasus-cnn_dailymail'

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(pegasus)

# Load the model
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(pegasus).to(device)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Get the dataset
dataset = load_dataset("samsum")
print(dataset)

Downloading data: 100%|██████████| 6.06M/6.06M [00:00<00:00, 12.5MB/s]
Downloading data: 100%|██████████| 347k/347k [00:00<00:00, 2.56MB/s]
Downloading data: 100%|██████████| 335k/335k [00:00<00:00, 2.68MB/s]
Generating train split: 100%|██████████| 14732/14732 [00:02<00:00, 6658.69 examples/s]
Generating test split: 100%|██████████| 819/819 [00:00<00:00, 89995.68 examples/s]
Generating validation split: 100%|██████████| 818/818 [00:00<00:00, 28327.72 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})


In [6]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch['dialogue'] , max_length = 1024, truncation = True )
    
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['summary'], max_length = 128, truncation = True )
        
    return {
        'input_ids' : input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

In [7]:
dataset_samsum_pt = dataset.map(convert_examples_to_features, batched = True)
print(dataset_samsum_pt['train'])

Map: 100%|██████████| 14732/14732 [00:03<00:00, 4491.50 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 5245.66 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 5707.72 examples/s]

Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 14732
})





In [8]:
# Training
from transformers import DataCollatorForSeq2Seq # responsible for creating batches of data

seq2seq_datacollator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

In [9]:
# Set training arguements
from transformers import TrainingArguments, Trainer

trainer_args = TrainingArguments(
    output_dir='pegasus-samsum', 
    num_train_epochs=1, 
    warmup_steps=500,
    per_device_train_batch_size=1, 
    per_device_eval_batch_size=1,
    weight_decay=0.01, 
    logging_steps=10,
    evaluation_strategy='steps', 
    eval_steps=500, 
    save_steps=1e6,
    gradient_accumulation_steps=16
) 

In [10]:
trainer = Trainer(
    model=model_pegasus, 
    args=trainer_args,
    tokenizer=tokenizer, 
    data_collator=seq2seq_datacollator,
    train_dataset=dataset_samsum_pt["test"], # train data is HUGE! Sticking to small since this notebook is only for the outline of the main project
    eval_dataset=dataset_samsum_pt["validation"]
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [11]:
# Train
trainer.train()



Step,Training Loss,Validation Loss


TrainOutput(global_step=6, training_loss=3.3213866551717124, metrics={'train_runtime': 108.0391, 'train_samples_per_second': 7.581, 'train_steps_per_second': 0.056, 'total_flos': 652477183033344.0, 'train_loss': 3.3213866551717124, 'epoch': 0.9320388349514563})

In [12]:
# Evaluation
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """split the dataset into smaller batches that we can process simultaneously
    Yield successive batch-sized chunks from list_of_elements."""
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]

def calculate_metric_on_test_ds(
    dataset,
    metric, 
    model, 
    tokenizer,     
    batch_size=16, 
    device=device, 
    column_text="article", 
    column_summary="highlights"
    ):
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):
        
        inputs = tokenizer(
            article_batch, 
            max_length=1024,  
            truncation=True, 
            padding="max_length", 
            return_tensors="pt"
            )
        
        summaries = model.generate(
            input_ids=inputs["input_ids"].to(device),
            attention_mask=inputs["attention_mask"].to(device), 
            length_penalty=0.8, 
            num_beams=8, 
            max_length=128
            )
        ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''
        
        # Finally, we decode the generated texts, 
        # replace the  token, and add the decoded texts with the references to the metric.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, 
                                clean_up_tokenization_spaces=True) 
               for s in summaries]      
        
        decoded_summaries = [d.replace("", " ") for d in decoded_summaries]
        
        
        metric.add_batch(predictions=decoded_summaries, references=target_batch)
        
    # Finally compute and return the ROUGE scores.
    score = metric.compute()
    return score

In [13]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_metric = load_metric('rouge')

  rouge_metric = load_metric('rouge')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Downloading builder script: 5.65kB [00:00, 10.9MB/s]                   


In [14]:
score = calculate_metric_on_test_ds(
    dataset['test'][0:10], rouge_metric, trainer.model, tokenizer, batch_size = 2, column_text = 'dialogue', column_summary= 'summary'
)

rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

pd.DataFrame(rouge_dict, index = [f'pegasus'] )

100%|██████████| 5/5 [00:11<00:00,  2.26s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.02023,0.0,0.020167,0.020183


In [15]:
# Save model
model_pegasus.save_pretrained("/Net/Groups/BGI/scratch/ppandey/Side_Quest/Text_Summarization_HF/experiments/pegasus-samsum-model")

Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


In [16]:
# Save tokenizer
tokenizer.save_pretrained("/Net/Groups/BGI/scratch/ppandey/Side_Quest/Text_Summarization_HF/experiments/tokenizer")

('/Net/Groups/BGI/scratch/ppandey/Side_Quest/Text_Summarization_HF/experiments/tokenizer/tokenizer_config.json',
 '/Net/Groups/BGI/scratch/ppandey/Side_Quest/Text_Summarization_HF/experiments/tokenizer/special_tokens_map.json',
 '/Net/Groups/BGI/scratch/ppandey/Side_Quest/Text_Summarization_HF/experiments/tokenizer/spiece.model',
 '/Net/Groups/BGI/scratch/ppandey/Side_Quest/Text_Summarization_HF/experiments/tokenizer/added_tokens.json',
 '/Net/Groups/BGI/scratch/ppandey/Side_Quest/Text_Summarization_HF/experiments/tokenizer/tokenizer.json')

In [18]:
#Load
tokenizer = AutoTokenizer.from_pretrained("/Net/Groups/BGI/scratch/ppandey/Side_Quest/Text_Summarization_HF/experiments/tokenizer")

In [19]:
#Prediction
gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}
sample_text = dataset["test"][0]["dialogue"]
reference = dataset["test"][0]["summary"]

pipe = pipeline(
    "summarization", 
    model="/Net/Groups/BGI/scratch/ppandey/Side_Quest/Text_Summarization_HF/experiments/pegasus-samsum-model",
    tokenizer=tokenizer
)

print("Dialogue:")
print(sample_text)

print("\nReference Summary:")
print(reference)

print("\nModel Summary:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])

Your max_length is set to 128, but your input_length is only 122. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)


Dialogue:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Reference Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.

Model Summary:
Amanda: Ask Larry Amanda: He called her last time we were at the park together .<n>Hannah: I'd rather you texted him .<n>Amanda: Just text him .
