# Script to Determine Hallucination Detection Performance based on Medcat UMLS Entities and Embeddings

In [1]:
import json
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import wordpunct_tokenize
import nltk
nltk.download('punkt')
from nervaluate import Evaluator

[nltk_data] Downloading package punkt to /home/s_hegs02/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Input files: labelled hallucination annotations, one JSONL file per split
bioc_labelled_hallucinations_10_valid_mimic_summaries_path = '/home/s_hegs02/MedTator/13_agreed_label_silver_validation_examples/hallucinations_10_valid_mimic_agreed.jsonl'
bioc_labelled_hallucinations_100_mimic_summaries_path = '/home/s_hegs02/MedTator/12_agreed_label_silver_examples/hallucinations_100_mimic_agreed.jsonl'
# TODO: Replace with the agreed dataset
bioc_labelled_hallucinations_100_generated_summaries = '/home/s_hegs02/MedTator/20_label_halus_qualitatative_annotator_1/hallucinations_100_generated_annotator_1.jsonl'

# Split name -> path of the annotation file for that split
dataset_paths = dict(
    valid_mimic=bioc_labelled_hallucinations_10_valid_mimic_summaries_path,
    test_mimic=bioc_labelled_hallucinations_100_mimic_summaries_path,
    test_generated=bioc_labelled_hallucinations_100_generated_summaries,
)

# Annotation shortcut -> full label name.
# The insertion order is significant: it fixes the column order of the per-label result rows.
annotated_labels = {
    'c': 'condition_unsupported',
    'p': 'procedure_unsupported',
    'm': 'medication_unsupported',
    't': 'time_unsupported',
    'l': 'location_unsupported',
    'n': 'number_unsupported',
    'na': 'name_unsupported',
    'w': 'word_unsupported',
    'o': 'other_unsupported',
    'co': 'contradicted_fact',
    'i': 'incorrect_fact',
}

# Load dataset
def read_jsonl(path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        path: Location of the ``.jsonl`` file (one JSON document per line).

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Explicit encoding so that parsing does not depend on the platform's default locale.
    with open(path, encoding='utf-8') as f:
        for line in f:
            # Skip blank lines (e.g. a trailing empty line), which json.loads would reject.
            if line.strip():
                records.append(json.loads(line))
    return records

# Parse every annotation file: split name -> list of records
datasets = {split_name: read_jsonl(jsonl_path) for split_name, jsonl_path in dataset_paths.items()}


# Per split: JSON file holding the extracted entities together with their embeddings
entities_embeddings_paths = dict(
    valid_mimic='/home/s_hegs02/mimic-iv-note-di-bhc/entities/medcat_entities_sapbert_embeddings_valid_mimic_umls_large.json',
    test_mimic='/home/s_hegs02/mimic-iv-note-di-bhc/entities/medcat_entities_sapbert_embeddings_test_mimic_umls_large.json',
    test_generated='/home/s_hegs02/mimic-iv-note-di-bhc/entities/medcat_entities_sapbert_embeddings_test_generated_umls_large.json',
)
# Read the data: one DataFrame per split, keyed like entities_embeddings_paths
entities_embeddings = {
    split_name: pd.read_json(json_path)
    for split_name, json_path in entities_embeddings_paths.items()
}

In [3]:
# Splits on which hallucinations are detected and evaluated
evaluate_datesets = [
    'valid_mimic',
    'test_mimic',
    'test_generated',
]

# Tag given to every predicted span
hallucination_label = 'hallucination'

In [4]:
# Tuning threshold and embeddings on pretty_name_embedding/source_value_embedding on valid_mimic using partial f1
# For thresholds:           0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 1.00
# pretty_name_embedding:     0.0,  0.0,  4.8,  6.0,  5.3,  5.3,  4.9,  6.2,  7.6,  7.6,  7.6,  7.6,  7.6
# source_value_embedding:    0.0,  0.0,  0.0,  5.4,  6.2,  5.7,  7.1,  7.8,  7.7,  7.7,  7.7,  7.7,  7.7
# Since source_value_embedding has the best performance, additional values were tested
# For thresholds:           0.70, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.80
# source_value_embedding:    7.1,  6.8,  6.8,  6.6,  6.5,  7.8,  7.8,  7.8,  7.7,  7.7,  7.7
# So choose 0.76 as the threshold for source_value_embedding

# Minimum cosine similarity for two entities to count as equivalent in 'cui_embedding' mode
embedding_equivalence_threshold = 0.76
# Entity field that holds the embedding vector used for the similarity comparison
embedding_field = 'source_value_embedding'


def entities_equal(entity1, entity2, mode):
    """Decide whether two extracted entities refer to the same concept.

    Args:
        entity1: Mapping with a 'cui' key; in 'cui_embedding' mode it must also hold the
            vector stored under ``embedding_field``.
        entity2: Mapping with the same structure as ``entity1``.
        mode: 'cui' to require identical CUIs, or 'cui_embedding' to additionally accept
            pairs whose embedding cosine similarity is at least
            ``embedding_equivalence_threshold``.

    Returns:
        True if the entities are considered equal under ``mode``, otherwise False.

    Raises:
        ValueError: If ``mode`` is not one of the supported modes. Without this check an
            unknown mode would fall through and return None, i.e. silently "never equal".
    """
    if mode not in ('cui', 'cui_embedding'):
        raise ValueError(f"Unknown entity match mode: {mode!r} (expected 'cui' or 'cui_embedding')")

    # Identical CUIs match in both modes.
    if entity1['cui'] == entity2['cui']:
        return True
    if mode == 'cui':
        return False

    # 'cui_embedding': fall back to the similarity of the two embedding vectors.
    similarity = cosine_similarity([entity1[embedding_field]], [entity2[embedding_field]])[0][0]
    return bool(similarity >= embedding_equivalence_threshold)

In [5]:
# Determine hallucination for each summary
dataset_results = {}
entity_match_mode = 'cui'
# Also use embeddings
# entity_match_mode = 'cui_embedding'


def convert_entity_to_label(entity):
    """Turn an extracted entity into a character-level label dict tagged with hallucination_label."""
    return {'start': entity['start'], 'end': entity['end'], 'length': entity['end'] - entity['start'], 'label': hallucination_label, 'text': entity['source_value']}


for dataset_name in evaluate_datesets:
    dataset_entities = entities_embeddings[dataset_name]
    dataset_examples = datasets[dataset_name]
    num_examples = len(dataset_entities)
    print(f"Determining hallucinations for {num_examples} examples in {dataset_name}...")

    # Entities and annotated examples are paired by position, so both must have the same length.
    assert num_examples == len(dataset_examples), (
        f"{dataset_name}: {num_examples} entity rows but {len(dataset_examples)} annotated examples"
    )

    results = []
    for ex_id in range(num_examples):
        example = dataset_examples[ex_id]
        text = example['text']
        summary = example['summary']
        entity_row = dataset_entities.iloc[ex_id]
        summary_entities = entity_row['summary'][0]['entities']
        text_entities = entity_row['text'][0]['entities']

        labels = []
        for summary_entity in summary_entities.values():
            label = convert_entity_to_label(summary_entity)
            # The entity offsets must index into this very summary, otherwise the two inputs are misaligned.
            assert summary[label['start']:label['end']] == label['text'], (
                f"{dataset_name}[{ex_id}]: entity offsets {label['start']}-{label['end']} "
                f"do not yield {label['text']!r} in the summary"
            )
            # A summary entity is supported as soon as one entity of the source text matches it.
            is_supported = any(
                entities_equal(summary_entity, text_entity, mode=entity_match_mode)
                for text_entity in text_entities.values()
            )
            if not is_supported:
                labels.append(label)
        results.append({'text': text, 'summary': summary, 'labels': labels})
        # Debug
        # print(f"Ex {ex_id} found hallucinations: {len(labels)}")
        # print({'text': text, 'summary': summary, 'labels': labels})
    print(f"\tFound {sum(len(r['labels']) for r in results)} hallucinations.")
    dataset_results[dataset_name] = results

Determining hallucinations for 10 examples in valid_mimic...
	Found 43 hallucinations.
Determining hallucinations for 100 examples in test_mimic...
	Found 635 hallucinations.
Determining hallucinations for 100 examples in test_generated...
	Found 743 hallucinations.


In [6]:
# Convert character-level labels to word-level labels (needed for the evaluation below)

# Define tokenizer
def tokenizer(x):
    """Split a string into tokens using NLTK's wordpunct_tokenize."""
    return wordpunct_tokenize(x)


def character_labels_to_word_labels(text, labels):
    """Convert character-offset labels into token-offset labels.

    Args:
        text: The string the labels refer to.
        labels: Iterable of dicts with the keys 'label', 'start' (character offset into
            ``text``) and 'text' (the labelled surface string).

    Returns:
        A new list of dicts with the keys 'label', 'start', 'end', 'length' and 'text',
        where 'start'/'end' are token indices into ``tokenizer(text)`` ('end' exclusive)
        and 'text' is copied unchanged from the input label.

    Raises:
        AssertionError: If the selected tokens do not spell the label text (ignoring
            whitespace), e.g. because a label starts in the middle of a token.
    """
    # Tokenise the full text once; it is the same for every label.
    text_tokens = tokenizer(text)

    new_labels = []
    for label in labels:
        # Number of tokens before the label = index of its first token.
        start = len(tokenizer(text[:label['start']]))
        end = start + len(tokenizer(label['text']))
        new_label = {
            'label': label['label'],
            'start': start,
            'end': end,
            'length': end - start,
            # Keep the original (untokenised) surface string.
            'text': label['text'],
        }
        # The tokens carry no whitespace at all, so strip every kind of whitespace
        # (not only spaces) from the label text before comparing.
        expected_chars = ''.join(label['text'].split())
        token_chars = ''.join(text_tokens[start:end])
        assert token_chars == expected_chars, (
            f"Token span {start}-{end} yields {token_chars!r} but the label text is {label['text']!r}"
        )
        new_labels.append(new_label)
    return new_labels

# Attach word-level spans ('token_labels') to the predicted labels of each evaluated split ...
for dataset_name in evaluate_datesets:
    predicted_examples = dataset_results[dataset_name]
    for ex in predicted_examples:
        ex.update(token_labels=character_labels_to_word_labels(ex['summary'], ex['labels']))

# ... and to the annotated labels of every loaded dataset
for dataset_name, dataset in datasets.items():
    for ex in dataset:
        ex.update(token_labels=character_labels_to_word_labels(ex['summary'], ex['labels']))
        

In [7]:
# Score the predicted word-level spans against the annotated ones with nervaluate
for dataset_name in evaluate_datesets:
    print(f"Evaluating hallucination detection on {dataset_name}...")
    true = [annotated_ex['token_labels'] for annotated_ex in datasets[dataset_name]]
    pred = [predicted_ex['token_labels'] for predicted_ex in dataset_results[dataset_name]]
    # All labels: the generic prediction tag plus every annotated label name
    all_tags = [hallucination_label, *annotated_labels.values()]
    evaluator = Evaluator(true, pred, tags=all_tags)
    results, results_per_tag = evaluator.evaluate()

    print("Raw results:")
    print(results)
    print(results_per_tag)
    print()

    # LaTeX row: partial precision / recall / f1 over all labels together
    error_type = 'partial'
    performance_measures = ['precision', 'recall', 'f1']
    print(f"Results for {error_type} on {dataset_name} ({performance_measures}):")
    overall_scores = results[error_type]
    overall_cells = [f"${overall_scores[measure]*100:.1f}$" for measure in performance_measures]
    print(f"{entity_match_mode} ({error_type}) & " + " & ".join(overall_cells) + " \\\\")

    # LaTeX row: partial recall for each annotated label, in the order of annotated_labels
    error_type = 'partial'
    performance_measures = 'recall'
    print(f"Results for {error_type} on {dataset_name} for all labels ({performance_measures}):")
    performances = [results_per_tag[tag][error_type][performance_measures] for tag in annotated_labels.values()]
    per_label_cells = [f"${performance*100:.1f}$" for performance in performances]
    print(f"{entity_match_mode} ({error_type}) & " + " & ".join(per_label_cells) + " \\\\")
    print()

Evaluating hallucination detection on valid_mimic...
Raw results:
{'ent_type': {'correct': 0, 'incorrect': 4, 'partial': 0, 'missed': 19, 'spurious': 39, 'possible': 23, 'actual': 43, 'precision': 0.0, 'recall': 0.0, 'f1': 0}, 'partial': {'correct': 1, 'incorrect': 0, 'partial': 3, 'missed': 19, 'spurious': 39, 'possible': 23, 'actual': 43, 'precision': 0.05813953488372093, 'recall': 0.10869565217391304, 'f1': 0.07575757575757576}, 'strict': {'correct': 0, 'incorrect': 4, 'partial': 0, 'missed': 19, 'spurious': 39, 'possible': 23, 'actual': 43, 'precision': 0.0, 'recall': 0.0, 'f1': 0}, 'exact': {'correct': 1, 'incorrect': 3, 'partial': 0, 'missed': 19, 'spurious': 39, 'possible': 23, 'actual': 43, 'precision': 0.023255813953488372, 'recall': 0.043478260869565216, 'f1': 0.030303030303030304}}
{'hallucination': {'ent_type': {'correct': 0, 'incorrect': 0, 'partial': 0, 'missed': 0, 'spurious': 39, 'possible': 0, 'actual': 39, 'precision': 0.0, 'recall': 0, 'f1': 0}, 'partial': {'correct'

In [8]:
# Toy evaluation example
# The documentation can be found here: https://pypi.org/project/nervaluate/
# nervaluate distinguishes the following match types:
# - Type	some overlap between the system tagged entity and the gold annotation is required
# - Partial	partial boundary match over the surface string, regardless of the type
# - Strict	exact boundary surface string match and entity type
# - Exact	exact boundary match over the surface string, regardless of the type
# In this setting, partial matching is sufficient because the baseline does not predict the type; this should give us an upper bound on the performance

# One toy document: annotated spans vs. predicted spans.
# NOTE(review): several 'length' values below do not equal end - start, and the spans are
# shorter than the word count of 'text' - confirm whether that is intended for this toy case.
true = [[
    {'start': 10, 'end': 11, 'length': 1,  'label': 'medication_unsupported', 'text': 'motrin'},
    {'start': 20, 'end': 21, 'length': 2, 'label': 'medication_unsupported', 'text': 'received antibiotics'},
    {'start': 30, 'end': 31, 'length': 3, 'label': 'condition_unsupported', 'text': 'breast cancer metastatic'},
]]
# The first prediction differs from the annotation in both its end offset and its label.
pred = [[
    {'start': 10, 'end': 12, 'length': 1,  'label': 'condition_unsupported', 'text': 'motrin'},
    {'start': 20, 'end': 21, 'length': 2, 'label': 'medication_unsupported', 'text': 'received antibiotics'},
    {'start': 30, 'end': 31, 'length': 3, 'label': 'condition_unsupported', 'text': 'breast cancer metastatic'},
]]
typed_tags = ['condition_unsupported', 'medication_unsupported']
evaluator = Evaluator(true, pred, tags=typed_tags)
results, results_per_tag = evaluator.evaluate()
# print(json.dumps(results, indent=4))
# print(json.dumps(results_per_tag, indent=4))

print("Simplified setting with only hallucination_label")
# Collapse every annotated and predicted label to the single generic tag (modifies the dicts in place)
for ex in true + pred:
    for label in ex:
        label['label'] = hallucination_label

evaluator = Evaluator(true, pred, tags=[hallucination_label])
results, results_per_tag = evaluator.evaluate()
# print(json.dumps(results, indent=4))
# print(json.dumps(results_per_tag, indent=4))

Simplified setting with only hallucination_label
