In [195]:
from typing import Dict, Tuple
import nltk

from tqdm import tqdm

import pandas as pd
from sklearn.model_selection import train_test_split

import numpy as np
from datasets import Dataset
import evaluate
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import Trainer, T5Tokenizer, T5ForConditionalGeneration, TrainingArguments
from transformers import StoppingCriteria, StoppingCriteriaList

In [2]:
MAX_LENGTH = 216

In [3]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
def summarize_text(text):
    # Preprocess the text
    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors='pt',
        max_length=MAX_LENGTH,
        truncation=True
    )
 
    # Generate the summary
    summary_ids = model.generate(
        inputs,
    )
 
    # Decode and return the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [5]:
input_sentence = "Diabetes, also known as diabetes mellitus, is a disease in which your blood glucose, or blood sugar, levels are too high. Glucose is your body's main source of energy. Your body can make glucose, but it also comes from the food you eat. Insulin is a hormone made by your pancreas. Insulin helps move glucose from your bloodstream into your cells, where it can be used for energy.If you have diabetes, your body can't make insulin, can't use insulin as well as it should, or both. Too much glucose stays in your blood and doesn't reach your cells. This can cause glucose levels to get too high. Over time, high blood glucose levels can lead to serious health conditions. But you can take steps to manage your diabetes and try to prevent these health problems."
print(summarize_text(input_sentence))



diabetes is also known as diabetes mellitus. too much glucose stays in your blood


In [6]:
wikismall_train_df = pd.read_csv("datasets/wikismall/train.csv")
wikismall_val_df = pd.read_csv("datasets/wikismall/val.csv")

wikismall_train = Dataset.from_pandas(wikismall_train_df)
wikismall_val = Dataset.from_pandas(wikismall_val_df)

print(wikismall_train)
print(wikismall_val)

Dataset({
    features: ['source', 'target'],
    num_rows: 88836
})
Dataset({
    features: ['source', 'target'],
    num_rows: 205
})


In [7]:
MIMIC_DATA_USAGE_ROWS = 25000
MIMIC_VAL_SIZE = 0.3
MIMIC_TEST_SIZE = 0.5
RANDOM_STATE = 42

mimic_df = pd.read_csv("datasets/mimic-iv-ext-bhc/mimic-iv-bhc.csv", nrows=MIMIC_DATA_USAGE_ROWS)
mimic_df.rename(columns={'input': 'source'}, inplace=True)

mimic_train_df, mimic_rest_df = train_test_split(mimic_df, test_size=MIMIC_VAL_SIZE, random_state=RANDOM_STATE)
mimic_val_df, mimic_test_df = train_test_split(mimic_rest_df, test_size=MIMIC_TEST_SIZE, random_state=RANDOM_STATE)

mimic_train = Dataset.from_pandas(mimic_train_df)
mimic_val = Dataset.from_pandas(mimic_val_df)
mimic_test = Dataset.from_pandas(mimic_test_df)

print(mimic_train)
print(mimic_val)
print(mimic_test)

Dataset({
    features: ['note_id', 'source', 'target', 'input_tokens', 'target_tokens', '__index_level_0__'],
    num_rows: 17500
})
Dataset({
    features: ['note_id', 'source', 'target', 'input_tokens', 'target_tokens', '__index_level_0__'],
    num_rows: 3750
})
Dataset({
    features: ['note_id', 'source', 'target', 'input_tokens', 'target_tokens', '__index_level_0__'],
    num_rows: 3750
})


In [8]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device {DEVICE}")

Using device cuda


In [9]:
MAX_LENGTH_WIKISMALL = 256
def preprocess(dataset):
    sources = [f"summarize: {source}" for source in dataset['source']]
    tokens = tokenizer(sources, max_length=MAX_LENGTH_WIKISMALL, truncation=True, padding='max_length')
 
    targets = [target for target in dataset['target']]
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=MAX_LENGTH_WIKISMALL, truncation=True, padding='max_length')
 
    tokens["labels"] = labels["input_ids"]
    return tokens
 
tokenized_train_wikismall = wikismall_train.map(preprocess, batched=True)
tokenized_valid_wikismall = wikismall_val.map(preprocess, batched=True)

Map:   0%|          | 0/88836 [00:00<?, ? examples/s]



Map:   0%|          | 0/205 [00:00<?, ? examples/s]

In [10]:
MAX_LENGTH_MIMIC = 512
def preprocess(dataset):
    sources = [f"summarize: {source}" for source in dataset['source']]
    tokens = tokenizer(sources, max_length=MAX_LENGTH_MIMIC, truncation=True, padding='max_length')
 
    targets = [target for target in dataset['target']]
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=MAX_LENGTH_MIMIC, truncation=True, padding='max_length')
 
    tokens["labels"] = labels["input_ids"]
    return tokens

tokenized_mimic_train = mimic_train.map(preprocess, batched=True)
tokenized_mimic_val = mimic_val.map(preprocess, batched=True)
tokenized_mimic_test = mimic_test.map(preprocess, batched=True)

Map:   0%|          | 0/17500 [00:00<?, ? examples/s]

Map:   0%|          | 0/3750 [00:00<?, ? examples/s]

Map:   0%|          | 0/3750 [00:00<?, ? examples/s]

In [11]:
model.to(DEVICE)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [12]:
# https://github.com/huggingface/transformers/blob/main/examples/pytorch/summarization/run_summarization.py

metric = evaluate.load("rouge")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels
 
def compute_metrics(eval_preds):
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        # Replace -100s used for padding as we can't decode them
        preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Some simple post-processing
        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

        result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
        result = {k: round(v * 100, 4) for k, v in result.items()}
        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
        result["gen_len"] = np.mean(prediction_lens)
        return result


In [13]:
# Memory leak bug fix: 
# Source: https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941
def preprocess_logits_for_metrics(logits, labels):
    """
    Original Trainer may have a memory leak.
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    pred_ids = torch.argmax(logits[0], dim=-1)
    return pred_ids, labels

In [14]:
BATCH_SIZE = 3
EPOCHS = 2

In [15]:
training_args = TrainingArguments(
    output_dir="t5-wikismall-dir",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    learning_rate=0.0001,
)
 
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_wikismall,
    eval_dataset=tokenized_valid_wikismall,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics
)
 
history = trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/59224 [00:00<?, ?it/s]

{'loss': 0.3325, 'learning_rate': 9.915574766986357e-05, 'epoch': 0.02}
{'loss': 0.1157, 'learning_rate': 9.831149533972715e-05, 'epoch': 0.03}
{'loss': 0.1171, 'learning_rate': 9.746724300959072e-05, 'epoch': 0.05}
{'loss': 0.1132, 'learning_rate': 9.662299067945428e-05, 'epoch': 0.07}
{'loss': 0.1094, 'learning_rate': 9.577873834931786e-05, 'epoch': 0.08}
{'loss': 0.1104, 'learning_rate': 9.493448601918141e-05, 'epoch': 0.1}
{'loss': 0.1121, 'learning_rate': 9.409023368904498e-05, 'epoch': 0.12}
{'loss': 0.1096, 'learning_rate': 9.324598135890856e-05, 'epoch': 0.14}
{'loss': 0.1111, 'learning_rate': 9.240172902877212e-05, 'epoch': 0.15}
{'loss': 0.105, 'learning_rate': 9.155747669863569e-05, 'epoch': 0.17}
{'loss': 0.1069, 'learning_rate': 9.071322436849927e-05, 'epoch': 0.19}
{'loss': 0.1064, 'learning_rate': 8.986897203836283e-05, 'epoch': 0.2}
{'loss': 0.108, 'learning_rate': 8.90247197082264e-05, 'epoch': 0.22}
{'loss': 0.1049, 'learning_rate': 8.818046737808996e-05, 'epoch': 0.2

In [16]:
training_args = TrainingArguments(
    output_dir="t5-wikismall-mimic-dir",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    learning_rate=0.0001,
)
 
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_mimic_train,
    eval_dataset=tokenized_mimic_val,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics
)
 
history = trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/11668 [00:00<?, ?it/s]

{'loss': 2.822, 'learning_rate': 9.571477545423381e-05, 'epoch': 0.09}
{'loss': 2.4894, 'learning_rate': 9.14295509084676e-05, 'epoch': 0.17}
{'loss': 2.3404, 'learning_rate': 8.714432636270141e-05, 'epoch': 0.26}
{'loss': 2.2888, 'learning_rate': 8.285910181693521e-05, 'epoch': 0.34}
{'loss': 2.2376, 'learning_rate': 7.857387727116902e-05, 'epoch': 0.43}
{'loss': 2.2113, 'learning_rate': 7.428865272540281e-05, 'epoch': 0.51}
{'loss': 2.1929, 'learning_rate': 7.000342817963662e-05, 'epoch': 0.6}
{'loss': 2.1284, 'learning_rate': 6.571820363387042e-05, 'epoch': 0.69}
{'loss': 2.1065, 'learning_rate': 6.143297908810422e-05, 'epoch': 0.77}
{'loss': 2.1391, 'learning_rate': 5.7147754542338014e-05, 'epoch': 0.86}
{'loss': 2.1292, 'learning_rate': 5.286252999657182e-05, 'epoch': 0.94}
{'loss': 2.0801, 'learning_rate': 4.8577305450805626e-05, 'epoch': 1.03}
{'loss': 2.0066, 'learning_rate': 4.4292080905039425e-05, 'epoch': 1.11}
{'loss': 2.0525, 'learning_rate': 4.0006856359273224e-05, 'epoch

In [216]:
MAX_LENGTH = 512

def summarize_text(text):
    # Preprocess the text
    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors='pt',
        max_length=MAX_LENGTH,
        truncation=True
    )
    inputs = inputs.to(DEVICE)
    len1 = len(inputs[0])
 
    # Generate the summary
    summary_ids = model.generate(
        inputs,
        exponential_decay_length_penalty=((int) (len1 * 0.8), -1.05),
        encoder_repetition_penalty=0.3,
        no_repeat_ngram_size=4,
        max_length=50,
        num_beams=5,
        temperature=0.9,
    )
 
    # Decode and return the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
model = model.to(DEVICE)
with open("metrics/sample.txt", 'r') as f:
    sample = [l.strip("\n") for l in f.readlines()]
# print(sample)
summary = [summarize_text(l) for l in sample]
for s in summary:
    print(s)

A diabetes is a chronic disease that occurs when the pancreas does not produce enough insulin or when the body can not effectively use the insulin it produces. A chronic disease is a type of diabetes that occurs
Insulin is an hormone that regulates blood glucose.
Hyperglycaemia, also called raised blood glucose or elevated blood sugar, is a common effect of uncontrolled diabetes and over time leads to serious damage to many of the body's systems, especially the nerves and blood vessels.
