In [1]:
from typing import Dict, Tuple
import nltk

from tqdm import tqdm

import pandas as pd
from sklearn.model_selection import train_test_split

import numpy as np
from datasets import Dataset
import evaluate
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import Trainer, T5Tokenizer, T5ForConditionalGeneration, TrainingArguments
from transformers import StoppingCriteria, StoppingCriteriaList
from transformers import BertModel, BertTokenizer, BertLMHeadModel, AutoModelForCausalLM, AutoTokenizer, BartModel, BartForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
max_input = 512
max_target = 128
batch_size = 3

In [3]:
model_path = "facebook/bart-large-xsum"

model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

  return torch.load(checkpoint_file, map_location=map_location)


In [4]:
wikismall_train_df = pd.read_csv("datasets/wikismall/train.csv")
wikismall_val_df = pd.read_csv("datasets/wikismall/val.csv")

wikismall_train = Dataset.from_pandas(wikismall_train_df)
wikismall_val = Dataset.from_pandas(wikismall_val_df)

print(wikismall_train)
print(wikismall_val)

Dataset({
    features: ['source', 'target'],
    num_rows: 88836
})
Dataset({
    features: ['source', 'target'],
    num_rows: 205
})


In [5]:
MIMIC_DATA_USAGE_ROWS = 3000
MIMIC_VAL_SIZE = 0.3
MIMIC_TEST_SIZE = 0.5
RANDOM_STATE = 42

mimic_df = pd.read_csv("datasets/mimic-iv-ext-bhc/mimic-iv-bhc.csv", nrows=MIMIC_DATA_USAGE_ROWS)
mimic_df.rename(columns={'input': 'source'}, inplace=True)

mimic_train_df, mimic_rest_df = train_test_split(mimic_df, test_size=MIMIC_VAL_SIZE, random_state=RANDOM_STATE)
mimic_val_df, mimic_test_df = train_test_split(mimic_rest_df, test_size=MIMIC_TEST_SIZE, random_state=RANDOM_STATE)

mimic_train = Dataset.from_pandas(mimic_train_df)
mimic_val = Dataset.from_pandas(mimic_val_df)
mimic_test = Dataset.from_pandas(mimic_test_df)

print(mimic_train)
print(mimic_val)
print(mimic_test)

Dataset({
    features: ['note_id', 'source', 'target', 'input_tokens', 'target_tokens', '__index_level_0__'],
    num_rows: 2100
})
Dataset({
    features: ['note_id', 'source', 'target', 'input_tokens', 'target_tokens', '__index_level_0__'],
    num_rows: 450
})
Dataset({
    features: ['note_id', 'source', 'target', 'input_tokens', 'target_tokens', '__index_level_0__'],
    num_rows: 450
})


In [6]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device {DEVICE}")

Using device cuda


In [7]:
MAX_LENGTH_MIMIC = 1024
def preprocess(dataset):
    sources = [f"summarize: {source}" for source in dataset['source']]
    tokens = tokenizer(sources, max_length=MAX_LENGTH_MIMIC, truncation=True, padding='max_length')
 
    targets = [target for target in dataset['target']]
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=MAX_LENGTH_MIMIC, truncation=True, padding='max_length')
 
    tokens["labels"] = labels["input_ids"]
    return tokens

tokenized_mimic_train = mimic_train.map(preprocess, batched=True)
tokenized_mimic_val = mimic_val.map(preprocess, batched=True)
tokenized_mimic_test = mimic_test.map(preprocess, batched=True)

Map:   0%|          | 0/2100 [00:00<?, ? examples/s]



Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

In [8]:
model.to(DEVICE)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerN

In [9]:
# https://github.com/huggingface/transformers/blob/main/examples/pytorch/summarization/run_summarization.py

metric = evaluate.load("rouge")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels
 
def compute_metrics(eval_preds):
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        # Replace -100s used for padding as we can't decode them
        preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Some simple post-processing
        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

        result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
        result = {k: round(v * 100, 4) for k, v in result.items()}
        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
        result["gen_len"] = np.mean(prediction_lens)
        return result


In [10]:
# Memory leak bug fix: 
# Source: https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941
def preprocess_logits_for_metrics(logits, labels):
    """
    Original Trainer may have a memory leak.
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    pred_ids = torch.argmax(logits[0], dim=-1)
    return pred_ids, labels

In [11]:
BATCH_SIZE = 3
EPOCHS = 2

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="bart-mimic-dir",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    learning_rate=0.0001,
    save_total_limit=2,
    predict_with_generate=True,
)
 
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_mimic_train,
    eval_dataset=tokenized_mimic_val,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)
 
history = trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/1400 [00:00<?, ?it/s]

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 1.6723, 'learning_rate': 6.428571428571429e-05, 'epoch': 0.71}
{'loss': 1.2859, 'learning_rate': 2.857142857142857e-05, 'epoch': 1.43}
{'train_runtime': 10029.5683, 'train_samples_per_second': 0.419, 'train_steps_per_second': 0.14, 'train_loss': 1.3897557285853794, 'epoch': 2.0}


In [17]:
model_path = f"bart-mimic-dir/checkpoint-1000"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
MAX_LENGTH = 512

def summarize_text(text):
    # Preprocess the text
    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors='pt',
        max_length=MAX_LENGTH,
        truncation=True
    )
    inputs = inputs.to(DEVICE)
    len1 = len(inputs[0])
 
    # Generate the summary
    summary_ids = model.generate(
        inputs,
        # exponential_decay_length_penalty=((int) (len1 * 0.8), -1.05),
        # encoder_repetition_penalty=0.7,
        no_repeat_ngram_size=4,
        max_length=50,
        num_beams=5,
        temperature=1,
    )
 
    # Decode and return the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [22]:
model = model.to(DEVICE)
with open("sample.txt", 'r') as f:
    sample = [l.strip("\n") for l in f.readlines()]
# print(sample)
summary = [summarize_text(l) for l in sample]
for s in summary:
    print(s)

Mr. ___ is a ___ year old man with a history of type 2 diabetes who presents with an inability to effectively use insulin. He was evaluated by the ___ service and found to be at an increased risk of harm to himself and others.
Mr. ___ is a ___ year old man with a history of diabetes who was found to be hyperglycemic on the day of admission. He was treated with IVF and his blood sugars improved. He was discharged home in stable condition.
Mr. ___ is a ___ year old man with a history of uncontrolled diabetes and over time leads to serious damage to many of the body's systems, especially the nerves and blood vessels. He was found to have hyperglycaemia. He
