In [1]:
!pip install git+https://github.com/google-research/bleurt.git -q
!pip install bert_score rouge-score evaluate -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [2]:
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#initializing model and tokenizer
model_name = "google/pegasus-xsum"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

## Preprcoessing dataset to find metrics

In [5]:
def preprocess_dataset(path):
    """
    Preprocesses the dataset by reading all the csv files in the given path and
    converts it into a list of dictionaries with the input text and the summary text.
    """
    csv_files = [os.path.join(path, file) for file in os.listdir(path) if file.endswith(".csv")]
    data = []

    for file in csv_files:
        df = pd.read_csv(file)

        #extract the summary
        df['Utterance_cleaned'] = df['Utterance'].str.lower().str.strip() # to handle "summary " and "Summary"
        summary_row = df[df["Utterance_cleaned"] == "summary"]
        summary_text = summary_row.iloc[0, 1] if not summary_row.empty else ""

        #filter out rows that are not actual utterances
        dialogue_df = df[~df["Utterance_cleaned"].isin(["summary", "primary_topic", "secondary_topic"])]

        #drop inactive utterances
        dialogue_df = dialogue_df[dialogue_df['Sub topic'] != 'inactive']

        #concatenate utterances and format input
        full_dialogue = " ".join(dialogue_df["Utterance"].dropna())

        input_text = f"summarize: {full_dialogue}"
        data.append({"input": input_text, "summary": summary_text})

    return data

In [6]:
train_data = preprocess_dataset("/kaggle/input/dataset/Train")
val_data = preprocess_dataset("/kaggle/input/dataset/Validation")
test_data = preprocess_dataset("/kaggle/input/dataset/Test")

## Fine-tuning the model

In [5]:
from transformers import TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

#making hugging face dataset instance to fine tune with trainer api
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
test_dataset = Dataset.from_list(test_data)

#create a dataset dictionary
dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

In [6]:
def preprocess_function(examples):
    """
    Preprocesses the dataset for fine tuning the model.
    """
    model_inputs = tokenizer(examples["input"], padding="longest", truncation=True)
    labels = tokenizer(examples["summary"], padding="max_length", truncation=True, max_length=150)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/131 [00:00<?, ? examples/s]

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

In [7]:
#fine tuning the model with trainer api and save the model
training_args = TrainingArguments(
    output_dir="./pegasus-finetuned",
    eval_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=1e-5,
    num_train_epochs=20,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    save_total_limit=1,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"]
)

trainer.train()

model.save_pretrained("./pegasus-finetuned")
tokenizer.save_pretrained("./pegasus-finetuned")

Epoch,Training Loss,Validation Loss
1,6.6529,5.334397
2,6.0189,5.293077
3,5.5647,5.227047
4,5.6576,5.149861
5,5.2292,5.047176
6,5.8572,4.943526
7,4.6775,4.855015
8,5.1765,4.782526
9,4.9918,4.707219
10,4.5362,4.649623


('./pegasus-finetuned/tokenizer_config.json',
 './pegasus-finetuned/special_tokens_map.json',
 './pegasus-finetuned/spiece.model',
 './pegasus-finetuned/added_tokens.json',
 './pegasus-finetuned/tokenizer.json')

## Summarizing the texts

In [7]:
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/pegasus-finetuned")
model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/input/pegasus-finetuned")
model.to(device)

for item in test_data:
    input_text = item["input"]

    #tokenize input
    input_ids = tokenizer(input_text, padding="longest", return_tensors="pt", truncation=True).input_ids.to(device)

    #generate summary
    summary_ids = model.generate(input_ids, max_length=150, num_beams=8, repetition_penalty=5.0, early_stopping=True)
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    #store the generated summary
    item["generated_summary"] = generated_summary

## Calculating BLEU score and BERT score on test set

In [8]:
references = [] #list to store target summaries
predictions = [] #list to store generated summaries

for item in test_data:
    references.append(item["summary"])  #ground truth summaries
    predictions.append(item["generated_summary"])

In [9]:
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import evaluate
from rouge_score import rouge_scorer


#Rogue score
def compute_rouge(predictions, references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    for pred, ref in zip(predictions, references):
        score = scorer.score(ref, pred)
        scores['rouge1'].append(score['rouge1'].fmeasure)
        scores['rouge2'].append(score['rouge2'].fmeasure)
        scores['rougeL'].append(score['rougeL'].fmeasure)
    return {key: sum(val)/len(val) for key, val in scores.items()}  # Averaging scores

rouge_scores = compute_rouge(predictions, references)

#BLEURT score
bleurt = evaluate.load("bleurt", config_name="bleurt-base-128")
results = bleurt.compute(predictions=predictions, references=references)
avg_bleurt = sum(results["scores"]) / len(results["scores"])

#BLEU score
smoothie = SmoothingFunction().method4
bleu_scores = [sentence_bleu(ref, pred.split(), smoothing_function=smoothie) for ref, pred in zip(references, predictions)]
avg_bleu = sum(bleu_scores) / len(bleu_scores)

#BERT score
P, R, F1 = score(predictions, [ref[0] for ref in references], lang="en")

Downloading builder script:   0%|          | 0.00/5.20k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/405M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
print(f"Rogue-1 Score: {rouge_scores['rouge1'] * 100:.2f}")
print(f"Rogue-2 Score: {rouge_scores['rouge2'] * 100:.2f}")
print(f"Rogue-L Score: {rouge_scores['rougeL'] * 100:.2f}")
print(f"BLEURT Score: {avg_bleurt:.4f}")
print()
print(f"BLEU score: {avg_bleu * 100:.2f}")
print(f"BERT score F1: {F1.mean().item() * 100:.2f}")
print(f"BERT score Precision: {P.mean().item() * 100:.2f}")
print(f"BERT score Recall: {R.mean().item() * 100:.2f}")

Rogue-1 Score: 29.07
Rogue-2 Score: 9.09
Rogue-L Score: 18.39
BLEURT Score: -0.7210

BLEU score: 11.39
BERT score F1: 79.81
BERT score Precision: 77.95
BERT score Recall: 81.78


In [11]:
print('Original summary:', references[1])
print()
print('Generated summary:', predictions[1])

Original summary: The therapist examines the abdomen and other parts of the body. The therapist pulls down the eyelids and checks for anemia, then for scar signs. The therapist requests to perform shifting dullness test, full lymph retinopathy screen including accelerate and inguinal lymph nodes.

Generated summary: The patient has a few other parts of their body that they would like to examine.
