## Evaluate Model Performance

In [48]:
from typing import Dict, Tuple
import nltk

from tqdm import tqdm

import pandas as pd
from sklearn.model_selection import train_test_split

import numpy as np
from datasets import Dataset
import evaluate
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import Trainer, T5Tokenizer, T5ForConditionalGeneration, TrainingArguments
from transformers import StoppingCriteria, StoppingCriteriaList
from transformers import BertModel, BertTokenizer, BertLMHeadModel, AutoModelForCausalLM, AutoTokenizer, BartModel, BartForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [49]:
import eval_metrics as metrics

In [50]:
# Upper bound on the number of CSV rows read into memory.
MIMIC_DATA_USAGE_ROWS = 5000

# Read the first MIMIC_DATA_USAGE_ROWS rows and expose the 'input' column
# under the name 'source'.
mimic_df = (
    pd.read_csv("datasets/mimic-iv-ext-bhc/mimic-iv-bhc.csv", nrows=MIMIC_DATA_USAGE_ROWS)
    .rename(columns={'input': 'source'})
)

# Source/target text of row 0.
mimic_sample = mimic_df.loc[0, 'source']
mimic_target = mimic_df.loc[0, 'target']

# Source/target text of row 2.
mimic_sample2 = mimic_df.loc[2, 'source']
mimic_target2 = mimic_df.loc[2, 'target']

In [51]:
# Use CUDA when torch reports it as available; otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(f"Using device {DEVICE}")

Using device cuda


### T5

In [52]:
# Checkpoint directory holding the T5 weights to evaluate.
model_path = "models/checkpoint-11500"

# NOTE(review): the tokenizer is loaded from the stock 't5-base' vocabulary,
# not from `model_path` — confirm the checkpoint was trained with the same vocabulary.
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Load the weights and place the model on the selected device.
model = T5ForConditionalGeneration.from_pretrained(model_path).to(DEVICE)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [53]:
# Maximum number of input tokens handed to the tokenizer (truncation is on).
MAX_LENGTH = 512


def summarize_text(text):
    """Summarize ``text`` with the global ``model``/``tokenizer`` pair.

    The text is prefixed with ``"summarize: "``, encoded with truncation at
    ``MAX_LENGTH``, moved to ``DEVICE`` and passed to ``model.generate``.
    The first generated sequence is decoded, and if it contains a period
    only the part up to and including the first period is returned.

    Args:
        text: The string to summarize.

    Returns:
        The decoded summary, cut after its first ``'.'`` when one is present.
    """
    prompt = "summarize: " + text
    input_ids = tokenizer.encode(
        prompt,
        return_tensors='pt',
        max_length=MAX_LENGTH,
        truncation=True,
    ).to(DEVICE)

    # NOTE(review): `top_k` is normally a sampling option and no `do_sample`
    # is passed here, so it may have no effect — confirm against the
    # generation config of the checkpoint.
    generation_kwargs = {
        'encoder_repetition_penalty': 1.8,
        'no_repeat_ngram_size': 3,
        'max_length': 50,
        'num_beams': 3,
        'top_k': 50,
    }
    generated = model.generate(input_ids, **generation_kwargs)
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Keep everything up to and including the first period, if there is one.
    head, period, _tail = decoded.partition('.')
    return head + period if period else decoded

In [54]:
# Score the summarizer currently bound to `summarize_text` on the leading rows
# of `mimic_df`, collecting per-sample ROUGE-1/ROUGE-2 scores and the
# Flesch-Kincaid difference between the generated summary and the target.

# Number of rows to evaluate (capped below by the size of the frame).
N_EVAL_SAMPLES = 1000

p1a, r1a, f1a = [], [], []  # ROUGE-1 precision / recall / F1, one entry per sample
p2a, r2a, f2a = [], [], []  # ROUGE-2 precision / recall / F1, one entry per sample
fka = []                    # flesch_kincaid(summary) - flesch_kincaid(target)
fk_failures = 0             # samples for which the Flesch-Kincaid delta could not be computed

n_eval = min(N_EVAL_SAMPLES, len(mimic_df))
for i in tqdm(range(n_eval), desc="T5 evaluation"):
    sample = mimic_df.loc[i, 'source']
    target = mimic_df.loc[i, 'target']
    # Named `summary` (not `sum`) so the builtin is not shadowed in the kernel.
    summary = summarize_text(sample)

    # NOTE(review): ROUGE is computed against the source note (`sample`), not
    # the reference `target` — confirm this is the intended comparison.
    # Each score is computed once and indexed as (precision, recall, F1).
    rouge1 = metrics.rouge_score(sample, summary, 1)['rouge1']
    p1a.append(rouge1[0])
    r1a.append(rouge1[1])
    f1a.append(rouge1[2])

    rouge2 = metrics.rouge_score(sample, summary, 2)['rouge2']
    p2a.append(rouge2[0])
    r2a.append(rouge2[1])
    f2a.append(rouge2[2])

    # Best-effort: a sample whose readability score cannot be computed is
    # skipped, but KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        fka.append(metrics.flesch_kincaid(summary) - metrics.flesch_kincaid(target))
    except Exception:
        fk_failures += 1

# rouge1 precision, recall, f1
print(np.mean(p1a))
print(np.mean(r1a))
print(np.mean(f1a))
# rouge2 precision, recall, f1
print(np.mean(p2a))
print(np.mean(r2a))
print(np.mean(f2a))
# delta flesch kincaid
print(np.mean(fka))
if fk_failures:
    print(f"Flesch-Kincaid skipped for {fk_failures} sample(s)")

KeyboardInterrupt: 

### BART

In [36]:
# Checkpoint directory holding the BART weights and tokenizer files to evaluate.
model_path = "models/checkpoint-1000"

# Both the tokenizer and the model are restored from the same checkpoint;
# the model is then placed on the selected device.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(DEVICE)

In [37]:
# Maximum number of input tokens handed to the tokenizer (truncation is on).
MAX_LENGTH = 1024


def summarize_text(text):
    """Summarize ``text`` with the global ``model``/``tokenizer`` pair.

    The text is prefixed with ``"summarize: "``, encoded with truncation at
    ``MAX_LENGTH``, moved to ``DEVICE`` and passed to ``model.generate``.

    Args:
        text: The string to summarize.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    # Build the prompt and tokenize it onto the model's device.
    prompt = "summarize: " + text
    input_ids = tokenizer.encode(
        prompt,
        return_tensors='pt',
        max_length=MAX_LENGTH,
        truncation=True,
    ).to(DEVICE)

    # Decoding settings forwarded unchanged to `generate`.
    generation_kwargs = {
        'no_repeat_ngram_size': 4,
        'max_length': 750,
        'num_beams': 5,
        'temperature': 1,
    }
    generated = model.generate(input_ids, **generation_kwargs)

    # Only the first returned sequence is decoded.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
# Score the summarizer currently bound to `summarize_text` on the leading rows
# of `mimic_df`, collecting per-sample ROUGE-1/ROUGE-2 scores and the
# Flesch-Kincaid difference between the generated summary and the target.

# Number of rows to evaluate (capped below by the size of the frame).
N_EVAL_SAMPLES = 1000

p1a, r1a, f1a = [], [], []  # ROUGE-1 precision / recall / F1, one entry per sample
p2a, r2a, f2a = [], [], []  # ROUGE-2 precision / recall / F1, one entry per sample
fka = []                    # flesch_kincaid(summary) - flesch_kincaid(target)
fk_failures = 0             # samples for which the Flesch-Kincaid delta could not be computed

n_eval = min(N_EVAL_SAMPLES, len(mimic_df))
for i in tqdm(range(n_eval), desc="BART evaluation"):
    sample = mimic_df.loc[i, 'source']
    target = mimic_df.loc[i, 'target']
    # Named `summary` (not `sum`) so the builtin is not shadowed in the kernel.
    summary = summarize_text(sample)

    # NOTE(review): ROUGE is computed against the source note (`sample`), not
    # the reference `target` — confirm this is the intended comparison.
    # Each score is computed once and indexed as (precision, recall, F1).
    rouge1 = metrics.rouge_score(sample, summary, 1)['rouge1']
    p1a.append(rouge1[0])
    r1a.append(rouge1[1])
    f1a.append(rouge1[2])

    rouge2 = metrics.rouge_score(sample, summary, 2)['rouge2']
    p2a.append(rouge2[0])
    r2a.append(rouge2[1])
    f2a.append(rouge2[2])

    # Best-effort: a sample whose readability score cannot be computed is
    # skipped, but KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        fka.append(metrics.flesch_kincaid(summary) - metrics.flesch_kincaid(target))
    except Exception:
        fk_failures += 1

# rouge1 precision, recall, f1
print(np.mean(p1a))
print(np.mean(r1a))
print(np.mean(f1a))
# rouge2 precision, recall, f1
print(np.mean(p2a))
print(np.mean(r2a))
print(np.mean(f2a))
# delta flesch kincaid
print(np.mean(fka))
if fk_failures:
    print(f"Flesch-Kincaid skipped for {fk_failures} sample(s)")

KeyboardInterrupt: 

In [39]:
# Show the per-sample Flesch-Kincaid deltas (summary minus target) currently held in `fka`.
print(fka)

[0.17743931475028774, -1.5564432539317536]
