In [1]:
from datasets import load_dataset
from datasets import load_metric
from transformers import pipeline
import nltk
from nltk.tokenize import sent_tokenize
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download NLTK's 'punkt' tokenizer package; presumably the data sent_tokenize relies on — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Load the pubmed summarization dataset

In [3]:
# Load the "ccdv/pubmed-summarization" dataset (downloads it on first use, as the log below shows).
dataset = load_dataset("ccdv/pubmed-summarization")

Downloading builder script: 100%|██████████████████████████████████████████████████████████████| 5.13k/5.13k [00:00<00:00, 1.93MB/s]
Downloading readme: 100%|███████████████████████████████████████████████████████████████████████| 2.66k/2.66k [00:00<00:00, 787kB/s]
Downloading data: 100%|██████████████████████████████████████████████████████████████████████████| 779M/779M [00:11<00:00, 67.6MB/s]
Downloading data: 100%|████████████████████████████████████████████████████████████████████████| 43.7M/43.7M [00:00<00:00, 60.2MB/s]
Downloading data: 100%|████████████████████████████████████████████████████████████████████████| 43.8M/43.8M [00:00<00:00, 60.0MB/s]
Generating train split: 119924 examples [00:15, 7839.43 examples/s]
Generating validation split: 6633 examples [00:00, 7462.56 examples/s]
Generating test split: 6658 examples [00:00, 7775.55 examples/s]


In [4]:
print(f"Features: {dataset['train'].column_names}")

Features: ['article', 'abstract']


In [5]:
# Use the training example at index 1 as the running example for the notebook.
sample = dataset['train'][1]
# Report the article's total character count, then show only its first 500 characters.
print(f"""
Article (excerpt of 500 characters, total length: {len(sample["article"])}):
""")
print(sample['article'][:500])
# Show the reference abstract in full, preceded by its character count.
print(f'\nSummary (length: {len(sample["abstract"])}):')
print(sample["abstract"])


Article (excerpt of 500 characters, total length: 18281):

it occurs in more than 50% of patients and may reach 90% in certain types of cancers , especially in patients undergoing chemotherapy and/or radiation therapy.1 anemia is defined as an inadequate circulating level of hemoglobin ( hb ) ( hb < 12 g / dl ) and may arise as a result of the underlying disease , bleeding , poor nutrition , chemotherapy , or radiation therapy . 
 preliminary studies suggest that survival and loco - regional control after radiation therapy , especially in head and neck 

Summary (length: 2010):
backgroundanemia in patients with cancer who are undergoing active therapy is commonly encountered and may worsen quality of life in these patients . the effect of blood transfusion is often temporary and may be associated with serious adverse events . 
 erythropoiesis - stimulating agents are not effective in 30%50% of patients and may have a negative effect on overall survival.aimsto assess the efficacy and f

## Load the pretrained models

We could also load these models using `AutoModel` and `AutoTokenizer` to find the maximum input length. It would look something like:

```python
from transformers import AutoModel, AutoTokenizer

# Load the model and tokenizer
model_name = 'Falconsai/medical_summarization'
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Get the maximum input size
max_input_size = model.config.max_position_embeddings
print(f"The maximum input size of the model is: {max_input_size}")
```

In [6]:
# Build a summarization pipeline backed by the "Falconsai/medical_summarization" checkpoint.
summarizer_falconsai = pipeline("summarization", model="Falconsai/medical_summarization")

config.json: 100%|█████████████████████████████████████████████████████████████████████████████| 1.50k/1.50k [00:00<00:00, 61.5kB/s]
model.safetensors: 100%|█████████████████████████████████████████████████████████████████████████| 242M/242M [00:03<00:00, 62.4MB/s]
generation_config.json: 100%|██████████████████████████████████████████████████████████████████████| 112/112 [00:00<00:00, 5.17kB/s]
tokenizer_config.json: 100%|████████████████████████████████████████████████████████████████████| 2.37k/2.37k [00:00<00:00, 106kB/s]
spiece.model: 100%|██████████████████████████████████████████████████████████████████████████████| 792k/792k [00:00<00:00, 39.8MB/s]
tokenizer.json: 100%|██████████████████████████████████████████████████████████████████████████| 2.42M/2.42M [00:00<00:00, 16.7MB/s]
special_tokens_map.json: 100%|█████████████████████████████████████████████████████████████████| 2.20k/2.20k [00:00<00:00, 1.06MB/s]


In [7]:
# Build a second summarization pipeline from the LongT5 PubMed checkpoint (a ~3 GB download per the log below).
summarizer_longt5 = pipeline("summarization", model="Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")

config.json: 100%|██████████████████████████████████████████████████████████████████████████████████| 853/853 [00:00<00:00, 397kB/s]
model.safetensors: 100%|███████████████████████████████████████████████████████████████████████| 3.13G/3.13G [00:46<00:00, 66.8MB/s]
tokenizer_config.json: 100%|████████████████████████████████████████████████████████████████████| 2.34k/2.34k [00:00<00:00, 119kB/s]
tokenizer.json: 100%|██████████████████████████████████████████████████████████████████████████| 2.42M/2.42M [00:00<00:00, 2.64MB/s]
special_tokens_map.json: 100%|██████████████████████████████████████████████████████████████████| 2.20k/2.20k [00:00<00:00, 884kB/s]


## Sample text comparison

Here we'll take the first 5000 characters of the article so that every model receives the same input.

In [8]:
# Keep only the first 5000 characters of the article so every model is given identical input text.
sample_text = sample['article'][:5000]
# Generated summaries are collected here, keyed by model name.
summaries = {}

In [9]:
sample_text

'it occurs in more than 50% of patients and may reach 90% in certain types of cancers , especially in patients undergoing chemotherapy and/or radiation therapy.1 anemia is defined as an inadequate circulating level of hemoglobin ( hb ) ( hb < 12 g / dl ) and may arise as a result of the underlying disease , bleeding , poor nutrition , chemotherapy , or radiation therapy . \n preliminary studies suggest that survival and loco - regional control after radiation therapy , especially in head and neck cancers , may be compromised by anemia.24 anemia often worsens symptoms such as fatigue , weakness , and dyspnea , and thus may have a negative effect on quality of life ( qol ) and performance status in patients with cancer . \n thus , to improve physical functioning , qol , and prognosis in patients with cancer , it would be reasonable to take a proactive approach in identifying populations who need treatment for cancer - associated anemia ( caa ) and provide timely management . \n blood tra

## Evaluate pretrained models

As a baseline we will use the first three sentences as the summary

In [10]:
def three_sentence_summary(text):
    """Return the leading sentences of ``text`` as a newline-separated summary.

    ``text`` is split with ``sent_tokenize``; at most the first three
    sentences are kept and joined with ``"\\n"``.
    """
    leading_sentences = sent_tokenize(text)[:3]
    return "\n".join(leading_sentences)

#### Baseline

In [11]:
# Store the three-sentence lead of the sample text as the "baseline" summary.
summaries["baseline"] = three_sentence_summary(sample_text)

In [12]:
summaries["baseline"]

'it occurs in more than 50% of patients and may reach 90% in certain types of cancers , especially in patients undergoing chemotherapy and/or radiation therapy.1 anemia is defined as an inadequate circulating level of hemoglobin ( hb ) ( hb < 12 g / dl ) and may arise as a result of the underlying disease , bleeding , poor nutrition , chemotherapy , or radiation therapy .\npreliminary studies suggest that survival and loco - regional control after radiation therapy , especially in head and neck cancers , may be compromised by anemia.24 anemia often worsens symptoms such as fatigue , weakness , and dyspnea , and thus may have a negative effect on quality of life ( qol ) and performance status in patients with cancer .\nthus , to improve physical functioning , qol , and prognosis in patients with cancer , it would be reasonable to take a proactive approach in identifying populations who need treatment for cancer - associated anemia ( caa ) and provide timely management .'

#### Falconsai

In [13]:
# Summarize the sample text with the Falconsai pipeline using its default generation settings.
# NOTE(review): the run logged below warns that this input (1282 tokens) exceeds the model's
# 512-token maximum; consider truncation=True or chunking the input — confirm the intended behavior.
output = summarizer_falconsai(sample_text)

Token indices sequence length is longer than the specified maximum sequence length for this model (1282 > 512). Running this sequence through the model will result in indexing errors


In [14]:
output

[{'summary_text': 'objectivethe aim of this study was to evaluate the efficacy and feasibility of intravenous ( iv ) iron monotherapy in patients with cancer who have anemia and who are undergoing treatment with chemotherapy and/or radiation therapy without the use of erythropoiesis - stimulating agents ( esas ) alone.materials and methodsthis pilot study was conducted to assess the effect of ivenous iron on quality of life ( qol ) , time to maximal response , and improvement in qiol parameters ( when measured ) in favor of the combination.resultsthe mean age of patients with caa were 18 years old , about to start a cycle of chemotherapy and radiation therapy within 1 week of inclusion . patients were treated for 12 weeks , followed by a 4-week follow - up period . the mean age ( p  '}]

In [15]:
# Re-split the generated summary into sentences and store it with one sentence per line.
summaries["falconsai"] = "\n".join(sent_tokenize(output[0]["summary_text"]))

#### long t5

In [16]:
# Summarize the same sample text with the LongT5 pipeline; max_length=200 is passed as the generation length limit.
output = summarizer_longt5(sample_text, max_length=200)



In [17]:
output

[{'summary_text': 'anemia is defined as an inadequate circulating level of hemoglobin ( hb ) ( hb  12 g / dl ) and may arise as a result of the underlying disease , bleeding , poor nutrition , chemotherapy , or radiation therapy . in randomized clinical trials in patients with cancer , erythropoiesis - stimulating agents produced significant increases in hb level , decreased transfusion requirements , and improved quality of life ( qol ) . however , 30%50% of patients do not respond to such agents . in addition , the use of erythropoietin stimulation in patients with cancer is partly attributed to the functional iron deficiency state , where the high rate of erythropoiesis exceeds the delivery of usable iron despite adequate iron stores . in this pilot'}]

In [18]:
# Re-split the generated summary into sentences and store it with one sentence per line.
summaries["longt5"] = "\n".join(sent_tokenize(output[0]["summary_text"]))

## Comparing different summaries

In [19]:
# Show the reference abstract first, followed by a blank line.
print("GROUND TRUTH", sample['abstract'], "", sep="\n")

# Then show each candidate summary under its upper-cased model name, again followed by a blank line.
for model_name in summaries:
    print(model_name.upper(), summaries[model_name], "", sep="\n")

GROUND TRUTH
backgroundanemia in patients with cancer who are undergoing active therapy is commonly encountered and may worsen quality of life in these patients . the effect of blood transfusion is often temporary and may be associated with serious adverse events . 
 erythropoiesis - stimulating agents are not effective in 30%50% of patients and may have a negative effect on overall survival.aimsto assess the efficacy and feasibility of intravenous iron therapy in patients with cancer who have non - iron - deficiency anemia and who are undergoing treatment with chemotherapy without the use of erythropoiesis - stimulating agents.methodsadult patients with solid cancers and non - iron - deficiency anemia were included . 
 ferric sucrose at a dose of 200 mg was given in short intravenous infusions weekly for a total of 12 weeks . 
 hemoglobin level was measured at baseline , every 3 weeks , and 2 weeks after the last iron infusion ( week 14 ) . 
 adverse events related to intravenous iron

## Evaluate quality of summaries

We will use the ROUGE metric to compare the quality of different summaries

In [20]:
# Load the ROUGE metric object that is reused for every evaluation in this notebook.
# NOTE(review): this call emits a warning in the output below; `load_metric` is, as far as I know,
# deprecated in recent `datasets` releases in favor of the `evaluate` package — confirm before upgrading.
rouge_metric = load_metric("rouge")

  rouge_metric = load_metric("rouge")
Downloading builder script: 5.65kB [00:00, 1.20MB/s]                                                                                


In [21]:
# ROUGE variants reported throughout the notebook (later cells reuse this list).
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
# Every candidate summary is scored against the sample's abstract.
reference = sample['abstract']
records = []

for model_name in summaries:
    # Score one prediction/reference pair per model and keep the mid F-measure of each variant.
    rouge_metric.add(prediction=summaries[model_name], reference=reference)
    score = rouge_metric.compute()
    rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
    records.append(rouge_dict)
# One row per model, one column per ROUGE variant.
pd.DataFrame.from_records(records, index=summaries.keys())

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
baseline,0.315126,0.097046,0.138655,0.268908
falconsai,0.337029,0.151448,0.195122,0.310421
longt5,0.279817,0.110599,0.137615,0.238532


## Evaluating the models on the test dataset

To keep the evaluation time reasonable, we will use a shuffled subset of 1,600 examples from the test dataset (200 batches of 8) to compare the models.

In [22]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 119924
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6633
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6658
    })
})

In [32]:
# Shuffle the test split with a fixed seed (42) and keep the first 1600 shuffled examples as the evaluation subset.
test_sampled = dataset["test"].shuffle(seed=42).select(range(1600))

#### Baseline

In [26]:
def evaluate_summaries_baseline(dataset, metric,
                               column_text='article',
                               column_summary='abstract'):
    """Score the three-sentence baseline against the reference summaries.

    Args:
        dataset: Object indexable by column name.
        metric: Metric exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        column_text: Column holding the source texts.
        column_summary: Column holding the reference summaries.

    Returns:
        The result of ``metric.compute()`` after all pairs were added.
    """
    predictions = list(map(three_sentence_summary, dataset[column_text]))
    references = dataset[column_summary]
    metric.add_batch(predictions=predictions, references=references)
    return metric.compute()

In [27]:
# Score the three-sentence baseline on the sampled test subset.
score = evaluate_summaries_baseline(test_sampled, rouge_metric)
# Keep the mid F-measure of each ROUGE variant and show it as a single "baseline" row.
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
pd.DataFrame(rouge_dict, index=["baseline"])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
baseline,0.2712,0.092727,0.171766,0.245244


#### falconsai

In [28]:
import torch

In [29]:
# Use the GPU when torch reports CUDA as available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [31]:
def chunks(list_of_elements, batch_size):
    """Lazily yield consecutive slices of ``list_of_elements``.

    Each slice holds at most ``batch_size`` items; the final slice is shorter
    when the length is not a multiple of ``batch_size``.
    """
    offsets = range(0, len(list_of_elements), batch_size)
    yield from (list_of_elements[start:start + batch_size] for start in offsets)

In [42]:
def evaluate_summaries(dataset, metric, model, tokenizer,
                      batch_size=16, device=device,
                      column_text="article", column_summary="abstract"):
    """Generate summaries batch by batch and score them against the references.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]`` and
            ``dataset[column_summary]`` are batched in parallel.
        metric: Metric exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Model exposing ``generate``; presumably already placed on
            ``device`` by the caller — confirm.
        tokenizer: Tokenizer used both to encode the articles and to decode
            the generated token ids.
        batch_size: Number of articles per generation batch.
        device: Device the tokenized inputs are moved to before generation.
        column_text: Column holding the source articles.
        column_summary: Column holding the reference summaries.

    Returns:
        The result of ``metric.compute()`` after every batch was added.
    """
    article_batches = list(chunks(dataset[column_text], batch_size))
    target_batches = list(chunks(dataset[column_summary], batch_size))

    for i, (article_batch, target_batch) in enumerate(zip(article_batches, target_batches)):
        # Log progress on every tenth batch only.
        if i % 10 == 0:
            print(f'Running batch {i + 1} of {len(article_batches)}')

        # Articles are truncated to 2048 tokens and every batch is padded to that full length.
        # NOTE(review): padding=True (pad to the longest item in the batch) would likely be cheaper — confirm.
        inputs = tokenizer(article_batch, max_length=2048, truncation=True, padding="max_length", return_tensors="pt")
        # Beam search with 8 beams; generated sequences are capped at 256 tokens.
        generated_ids = model.generate(input_ids=inputs["input_ids"].to(device),
                                       attention_mask=inputs["attention_mask"].to(device),
                                       length_penalty=0.8, num_beams=8, max_length=256)

        # The keyword is `clean_up_tokenization_spaces` (plural); the previous singular
        # spelling did not match the decode parameter, so the cleanup option was never applied.
        decoded_summaries = [
            tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for ids in generated_ids
        ]
        # Replace any literal "<n>" markers in the decoded text with spaces.
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    score = metric.compute()
    return score

In [34]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [35]:
# Checkpoint id from which both the tokenizer and the model are loaded in the next cells.
model_ckpt = "Falconsai/medical_summarization"

In [36]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [38]:
# Load the seq2seq model for the checkpoint and move it to the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

In [43]:
# Evaluate the Falconsai model on the sampled test subset in batches of 8 (200 batches per the log below).
score = evaluate_summaries(test_sampled, rouge_metric, model, tokenizer, batch_size=8)

Running batch 1 of 200
Running batch 11 of 200
Running batch 21 of 200
Running batch 31 of 200
Running batch 41 of 200
Running batch 51 of 200
Running batch 61 of 200
Running batch 71 of 200
Running batch 81 of 200
Running batch 91 of 200
Running batch 101 of 200
Running batch 111 of 200
Running batch 121 of 200
Running batch 131 of 200
Running batch 141 of 200
Running batch 151 of 200
Running batch 161 of 200
Running batch 171 of 200
Running batch 181 of 200
Running batch 191 of 200


In [44]:
# Keep the mid F-measure of each ROUGE variant and show it as a single "falconsai" row.
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
pd.DataFrame(rouge_dict, index=["falconsai"])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
falconsai,0.362385,0.145307,0.225378,0.308867
