In [None]:
from datasets import load_dataset
from datasets import load_metric
from transformers import pipeline
import nltk
from nltk.tokenize import sent_tokenize
import pandas as pd

In [None]:
# Fetch the sentence-tokenizer data that `sent_tokenize` loads at call time.
# Older NLTK releases read the 'punkt' resource; NLTK 3.9 and later look up
# 'punkt_tab' instead and raise LookupError when only 'punkt' is installed,
# so both are requested to keep the notebook working across versions.
nltk.download('punkt')
nltk.download('punkt_tab')

## Load the pubmed summarization dataset

In [None]:
# Load the "ccdv/pubmed-summarization" dataset; its splits are accessed by
# name in later cells (e.g. dataset['train']).
dataset = load_dataset("ccdv/pubmed-summarization")

In [None]:
# Show which columns every training example provides.
train_columns = dataset['train'].column_names
print(f"Features: {train_columns}")

In [None]:
# Use the training example at index 1 as the running sample for the notebook.
sample = dataset['train'][1]

# Preview only the first 500 characters of the article; the reported lengths
# are character counts (len of the string).
article_text = sample["article"]
print(f"\nArticle (excerpt of 500 characters, total length: {len(article_text)}):\n")
print(article_text[:500])

# The reference summary is stored under the "abstract" key and printed in full.
abstract_text = sample["abstract"]
print(f'\nSummary (length: {len(abstract_text)}):')
print(abstract_text)

## Load the pretrained models

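We could also load these models using `AutoModel` and `AutoTokenizer` to find the maximum input length. Note that not every architecture defines `max_position_embeddings` in its config (for example, T5-style models use relative position embeddings), in which case `tokenizer.model_max_length` is an alternative to check. It would look something like: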

```python
from transformers import AutoModel, AutoTokenizer

# Load the model and tokenizer
model_name = 'Falconsai/medical_summarization'
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Get the maximum input size
max_input_size = model.config.max_position_embeddings
print(f"The maximum input size of the model is: {max_input_size}")
```

In [None]:
# Summarization pipeline backed by the "Falconsai/medical_summarization" checkpoint.
summarizer_falconsai = pipeline("summarization", model="Falconsai/medical_summarization")

In [None]:
# Summarization pipeline backed by the
# "Stancld/longt5-tglobal-large-16384-pubmed-3k_steps" checkpoint.
summarizer_longt5 = pipeline("summarization", model="Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")

## Sample text comparison

Here we'll take the first 3000 characters of the sample article so that every model receives exactly the same input.

In [None]:
# Cap the article at its first 3000 characters so every summarizer below is
# given the same text.
sample_text = sample['article'][:3000]
# Maps a model name (e.g. "baseline") to the summary it produced for `sample_text`.
summaries = {}

In [None]:
# Display the truncated input text that will be summarized.
sample_text

## Evaluate pretrained models

As a baseline, we will use the first three sentences of the input text as the summary.

In [None]:
def three_sentence_summary(text):
    """Return the leading three sentences of ``text``, one per line.

    The text is split into sentences with ``sent_tokenize``; at most the
    first three are kept (fewer if the text is shorter) and joined with
    newline characters.
    """
    leading_sentences = sent_tokenize(text)[:3]
    return "\n".join(leading_sentences)

#### Baseline

In [None]:
# Baseline: no model involved, just the first three sentences of the input.
summaries["baseline"] = three_sentence_summary(sample_text)

In [None]:
# Display the baseline summary.
summaries["baseline"]

#### Falconsai

In [None]:
# Summarize the sample with the Falconsai pipeline; no generation arguments
# are passed, so the pipeline's own defaults apply.
# NOTE(review): the LongT5 call later in this notebook passes max_length=200
# while this call does not — confirm the asymmetry is intended.
output = summarizer_falconsai(sample_text)

In [None]:
# Display the raw pipeline result; the summary text is read from
# output[0]["summary_text"] in the next cell.
output

In [None]:
# Put each sentence of the Falconsai summary on its own line, matching the
# newline-separated layout produced by three_sentence_summary.
falconsai_sentences = sent_tokenize(output[0]["summary_text"])
summaries["falconsai"] = "\n".join(falconsai_sentences)

#### LongT5

In [None]:
# Summarize the same sample with the LongT5 pipeline, passing max_length=200.
# `output` is reused here, so the Falconsai result it held is overwritten.
output = summarizer_longt5(sample_text, max_length=200)

In [None]:
# Display the raw pipeline result; the summary text is read from
# output[0]["summary_text"] in the next cell.
output

In [None]:
# Put each sentence of the LongT5 summary on its own line, matching the
# newline-separated layout produced by three_sentence_summary.
longt5_sentences = sent_tokenize(output[0]["summary_text"])
summaries["longt5"] = "\n".join(longt5_sentences)

## Comparing different summaries

In [None]:
# Print the reference abstract first, then every collected summary under an
# upper-cased heading, each followed by a blank line.
print("GROUND TRUTH")
print(sample['abstract'])
print("")

for model_name, summary_text in summaries.items():
    print(model_name.upper())
    print(summary_text)
    print("")

## Evaluate quality of summaries

We will use the ROUGE metric to compare the quality of different summaries

In [None]:
# Load the ROUGE metric object used to score the summaries below.
# NOTE(review): `load_metric` has been deprecated in recent `datasets`
# releases in favour of the separate `evaluate` package — confirm the
# installed `datasets` version still provides it.
rouge_metric = load_metric("rouge")

In [None]:
# Every candidate summary is scored against the same reference abstract.
reference = sample['abstract']
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
records = []

for model_name, candidate in summaries.items():
    # Score one prediction at a time: add() queues the pair and compute()
    # returns the scores for it.
    rouge_metric.add(prediction=candidate, reference=reference)
    score = rouge_metric.compute()
    # Keep only the `mid` F-measure of each ROUGE variant.
    rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
    records.append(rouge_dict)

# One row per model (in insertion order of `summaries`), one column per ROUGE variant.
pd.DataFrame.from_records(records, index=list(summaries))