In [None]:
# Log this notebook session in to the Hugging Face Hub.
# NOTE(review): presumably only needed when the dataset/model used below are
# private or gated — confirm; nothing in the visible cells reads the login result.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
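# Uncomment to install the packages below if they are missing from the kernel's
# environment (%pip installs into the environment of the running kernel).
# %pip install datasets
# %pip install evaluate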

In [1]:
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import evaluate
import torch
import pandas as pd
import numpy as np

In [4]:
# Column names used throughout the notebook: `seq1id` is the text fed to the
# model, `seq2id` is the reference text the output is scored against.
seq1id, seq2id = 'dialogue', 'section_text'

# Previously used dataset id (left disabled, as in the original cell):
#   load_dataset("har1/MTS_Dialogue-Clinical_Note", streaming=False)
ds = load_dataset(
    "ryanwtsai/MTS_Dialogue-Clinical_Note_OriginalTrainValSplit",
    streaming=False,
)

# Bare last expression so the notebook renders the DatasetDict summary.
ds

DatasetDict({
    train: Dataset({
        features: ['ID', 'section_header', 'section_text', 'dialogue', 'input_ids', 'attention_mask', 'length', 'labels'],
        num_rows: 1200
    })
    val: Dataset({
        features: ['ID', 'section_header', 'section_text', 'dialogue', 'input_ids', 'attention_mask', 'length', 'labels'],
        num_rows: 100
    })
})

In [7]:
# Device index handed to the pipeline below: 0 when CUDA is available,
# otherwise -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

In [8]:
# Build the text2text generation pipeline around the fine-tuned checkpoint,
# placed on the device selected earlier in the notebook.
summarizer = pipeline(
    task="text2text-generation",
    model="har1/HealthScribe-Clinical_Note_Generator",
    device=device,
)

In [9]:
# Load the two evaluation metrics used further down.
rouge = evaluate.load('rouge')
bertscore = evaluate.load('bertscore')

# Sanity check: print the class name of each loaded metric, one per line.
print(bertscore.__class__.__name__, rouge.__class__.__name__, sep="\n")

BERTScore
Rouge


In [10]:
# Run the pipeline over every validation sample and collect, in matching order:
#   enc_seqs    - the input text (column `seq1id`)
#   predictions - the generated text
#   references  - the ground-truth text (column `seq2id`)
ds_test = ds['val']
enc_seqs = []
predictions = []
references = []

# Iterate over the rows directly so each sample is fetched once instead of
# being looked up by index separately for every column.
for sdx, sample in enumerate(ds_test):
    print(f"Sample {sdx}")
    enc_seq = sample[seq1id]
    gnd_truth = sample[seq2id]
    # `truncation=True` is forwarded to the pipeline call; the result is
    # indexed as a list whose first item carries 'generated_text'.
    pred = summarizer(enc_seq, truncation=True)
    predictions.append(pred[0]['generated_text'])
    references.append(gnd_truth)
    enc_seqs.append(enc_seq)

# The metric cells assume the three lists are aligned one-to-one.
assert len(predictions) == len(references) == len(enc_seqs) == ds_test.num_rows

# Fixed: the original string literal was missing its closing quote, which made
# the whole cell a SyntaxError.
print("Finished predictions")

  attn_output = torch.nn.functional.scaled_dot_product_attention(
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [26]:
# Score the generated texts against the references with BERTScore.
result = bertscore.compute(
    predictions=predictions,
    references=references,
    model_type='distilbert-base-uncased',
)

# 'hashcode' is removed before averaging so only the score entries remain.
del result['hashcode']

# Collapse each remaining entry to its mean and prefix the key with the
# metric name.
result = {
    'bertscore_' + metric_name: np.mean(np.array(scores))
    for metric_name, scores in result.items()
}
print(result)

{'bertscore_precision': 0.8690398466587067, 'bertscore_recall': 0.9105064260959626, 'bertscore_f1': 0.8888216495513916}


In [27]:
# Score the same predictions/references with ROUGE.
result = rouge.compute(
    predictions=predictions,
    references=references,
)

# Rebuild the mapping as a plain dict with string-formatted keys.
result = dict((f'{name}', score) for name, score in result.items())
print(result)

{'rouge1': 0.5424230439097715, 'rouge2': 0.3391861170607466, 'rougeL': 0.4627998509878562, 'rougeLsum': 0.5123052699297725}


In [24]:
# Qualitative check: print input, ground truth and prediction for up to 10
# validation samples.
#
# Indices are drawn WITHOUT replacement from a seeded generator, so no sample
# is shown twice and a re-run prints the same examples. Change the seed to look
# at a different subset. `min(...)` keeps the draw valid when the split has
# fewer than 10 rows.
random_idx = np.random.default_rng(42).choice(
    ds_test.num_rows,
    size=min(10, ds_test.num_rows),
    replace=False,
)
for i in random_idx:
    print("Input:")
    print(enc_seqs[i])
    print("")
    print("Ground truth:")
    print(references[i])
    print("")
    print("Predictions:")
    print(predictions[i])
    print("****************************************************************")

Input:
Doctor: Do you have any prior history of surgeries? 
Patient: I had surgery on my back and shoulder after a bad skiing accident.  
Doctor: How long ago did you have those surgeries? 
Patient: About three to four years ago. Oh, I had my appendix removed when I was a teenager.

Ground truth:
Symptoms: N/A
Diagnosis: N/A
History of Complaint: Back surgery and shoulder surgery performed approximately three to four years ago, appendicectomy during teenage years
Plan of Action: N/A


Predictions:
Symptoms: N/A
Diagnosis: None
History of Patient: Surgery on back and shoulder after skiing accident 3-4 years ago, appendectomy as a teenager
Plan of Action: Surgery for shoulder and back injuries, evaluation and management of postoperative pain

****************************************************************
Input:
Doctor: Have you had any thoughts of harming yourself or others? 
Patient: I've had thoughts of not wanting to be alive anymore but no plans of actually hurting myself. And abso