# Create JSON datasets from raw bioc labelings

In [1]:
import pandas as pd
import json
import re
from utils import read_bioc, parse_text_labels
from pathlib import Path

In [2]:
# Define files and parameters
# Root folder holding the MedTator BioC XML exports, one subfolder per labeling round
data_path = Path('/home/s_hegs02/MedTator')

# Dataset name -> BioC XML file with the raw annotations
dataset_paths = {
    # Experiment 1: label mimic summaries
    'hallucinations_100_mimic_annotator_1': data_path / '10_label_silver_examples_annotator_1' / 'hallucinations_100_mimic_annotator_1.xml',
    'hallucinations_100_mimic_annotator_2': data_path / '11_label_silver_examples_annotator_2' / 'hallucinations_100_mimic_annotator_2.xml',
    'hallucinations_100_mimic_agreed': data_path / '12_agreed_label_silver_examples' / 'hallucinations_100_mimic_agreed.xml',
    'hallucinations_10_valid_mimic_agreed': data_path / '13_agreed_label_silver_validation_examples' / 'hallucinations_10_valid_mimic_agreed.xml',
    # Experiment 2: label generated summaries
    'hallucinations_100_generated_annotator_1': data_path / '20_label_halus_qualitative_annotator_1' / 'hallucinations_100_generated_annotator_1.xml',
    'hallucinations_100_generated_annotator_2': data_path / '21_label_halus_qualitative_annotator_2' / 'hallucinations_100_generated_annotator_2.xml',
    'hallucinations_100_generated_agreed': data_path / '22_label_halus_qualitative_agreed' / 'hallucinations_100_generated_agreed.xml',
}

# Randomization for Experiment 2
# Example id -> permutation of 0..4. Entry i is the slot that the summary shown at
# position i of the annotated document is moved to when de-randomizing
# (presumably the index of the generating model -- confirm against the experiment setup).
hallucination_random_models = {0: [0, 4, 3, 1, 2], 1: [4, 1, 3, 2, 0], 2: [0, 2, 1, 3, 4], 3: [1, 0, 3, 4, 2], 4: [3, 2, 4, 1, 0], 5: [3, 2, 1, 4, 0], 6: [1, 2, 3, 0, 4], 7: [1, 3, 2, 4, 0], 8: [4, 0, 1, 3, 2], 9: [0, 3, 2, 4, 1], 10: [0, 4, 3, 2, 1], 11: [1, 0, 4, 3, 2], 12: [2, 4, 1, 0, 3], 13: [3, 1, 0, 4, 2], 14: [4, 2, 0, 1, 3], 15: [0, 2, 4, 3, 1], 16: [1, 4, 2, 3, 0], 17: [2, 3, 1, 0, 4], 18: [4, 0, 3, 2, 1], 19: [0, 3, 1, 2, 4], 20: [4, 0, 2, 3, 1], 21: [0, 4, 2, 1, 3], 22: [0, 2, 4, 3, 1], 23: [1, 0, 3, 4, 2], 24: [3, 1, 0, 4, 2], 25: [2, 0, 3, 4, 1], 26: [4, 3, 0, 1, 2], 27: [3, 4, 2, 1, 0], 28: [4, 2, 3, 1, 0], 29: [4, 1, 3, 0, 2], 30: [2, 3, 0, 1, 4], 31: [4, 2, 0, 3, 1], 32: [3, 0, 2, 1, 4], 33: [2, 3, 4, 1, 0], 34: [4, 1, 3, 2, 0], 35: [0, 4, 1, 3, 2], 36: [4, 1, 3, 0, 2], 37: [3, 1, 0, 4, 2], 38: [3, 2, 4, 1, 0], 39: [1, 0, 3, 4, 2], 40: [4, 3, 0, 1, 2], 41: [2, 3, 4, 0, 1], 42: [2, 4, 3, 1, 0], 43: [4, 1, 2, 0, 3], 44: [0, 4, 3, 1, 2], 45: [3, 2, 0, 1, 4], 46: [2, 4, 0, 3, 1], 47: [2, 1, 0, 4, 3], 48: [4, 2, 3, 1, 0], 49: [3, 1, 4, 2, 0]}

# Define markers
# Regexes locating the header lines that introduce the text and the summary section(s)
# of an annotated document. Raw strings keep '\d' from being read as an (invalid)
# string escape; the regex engine interprets '\n' and '\d' itself.
re_text_start_mimic_old_key = re.compile(r'### JSON Key: text\n', re.MULTILINE)
re_summary_start_mimic_old_key = re.compile(r'### JSON Key: summary\n', re.MULTILINE)
re_text_start_mimic = re.compile(r'Text:\n', re.MULTILINE)
re_summary_start_mimic = re.compile(r'Summary:\n', re.MULTILINE)
re_text_start_generated = re.compile(r'Text:\n', re.MULTILINE)
re_summary_start_generated = re.compile(r'Summary \d:\n', re.MULTILINE)
# Dataset name -> (text start marker, summary start marker)
markers = {
    name: (re_text_start_mimic, re_summary_start_mimic) if 'mimic' in name
    else (re_text_start_generated, re_summary_start_generated)
    for name in dataset_paths
}
# These two MedTator datasets still use the old key
markers['hallucinations_100_mimic_annotator_1'] = (re_text_start_mimic_old_key, re_summary_start_mimic_old_key)
markers['hallucinations_100_mimic_annotator_2'] = (re_text_start_mimic_old_key, re_summary_start_mimic_old_key)

In [3]:
# Read every BioC XML file, then extract the raw text and labels from each one
datasets_bioc = {name: read_bioc(path) for name, path in dataset_paths.items()}
datasets_unprocessed = {
    name: parse_text_labels(bioc) for name, bioc in datasets_bioc.items()
}
# One (initially empty) list of processed examples per dataset
datasets = {name: [] for name in datasets_unprocessed}

# Print the number of parsed examples per dataset
for name, examples in datasets_unprocessed.items():
    print(f"{name}: {len(examples)} examples")

In [9]:
# The datasets still contain both the text (BHC) and the summary (discharge instructions), and the label positions are relative to the combined document.
# Additionally, each generated example contains one text and 5 randomized generations.
# We must split this data and correct the label positions so that they are relative to the summaries only.

In [5]:
# 1. Labeling: split text and summary in the mimic examples and correct the label positions.
# Each raw document holds a text section followed by a summary section, and the label
# offsets refer to the whole document. Afterwards the offsets are relative to the
# start of the summary.

for dataset_name in [k for k in datasets_unprocessed.keys() if 'mimic' in k]:
    re_text_start, re_summary_start = markers[dataset_name]
    raw_examples = datasets_unprocessed[dataset_name]
    # Collect into a fresh list and assign it at the end, so that re-running this
    # cell replaces the dataset instead of appending every example a second time.
    processed_examples = []
    # Process the examples in ascending key order
    for key in sorted(raw_examples.keys()):
        example = raw_examples[key]
        source = example['text']

        text_match = re_text_start.search(source)
        summary_match = re_summary_start.search(source)
        if text_match is None or summary_match is None:
            raise ValueError(f"{dataset_name}, example {key}: text or summary marker not found")

        # The text lies between the two markers, the summary runs to the end of the document
        summary_start = summary_match.end()
        text = source[text_match.end():summary_match.start()].strip()
        summary = source[summary_start:].rstrip()
        # Only trailing whitespace may be stripped, otherwise the label offsets would shift
        assert summary == summary.lstrip(), f"{dataset_name}, example {key}: summary starts with whitespace"

        labels = []
        for label in example['labels']:
            # The label must match the raw document before it is moved
            assert source[label['start']:label['end']] == label['text']
            new_label = label.copy()
            new_label['start'] = label['start'] - summary_start
            new_label['end'] = label['end'] - summary_start
            # A negative start would mean the label lies before the summary; slicing
            # with negative indices would silently count from the end instead.
            assert new_label['start'] >= 0, f"{dataset_name}, example {key}: label outside of summary"
            # The shifted label must select the same characters in the summary
            assert summary[new_label['start']:new_label['end']] == label['text']
            labels.append(new_label)

        processed_examples.append({'text': text, 'summary': summary, 'labels': labels})

    datasets[dataset_name] = processed_examples
    num_labels = sum(len(ex['labels']) for ex in processed_examples)
    print(f"Added {len(processed_examples)} examples with {num_labels} labels to {dataset_name}")

Added 100 examples with 239 labels to hallucinations_100_mimic_annotator_1
Added 100 examples with 282 labels to hallucinations_100_mimic_annotator_2
Added 100 examples with 286 labels to hallucinations_100_mimic_agreed
Added 10 examples with 23 labels to hallucinations_10_valid_mimic_agreed


In [6]:
# 2. Labeling: de-randomize the generated summaries and put them into separate
# text-summary examples.
# Each raw document holds one text followed by several summaries, each introduced by
# a 'Summary <digit>:' line. The label offsets refer to the whole document and are
# re-based onto the summary they fall into. hallucination_random_models[example_id][i]
# is the de-randomized slot of the summary shown at position i.

# Number of summaries per annotated document
num_summaries = 5
# Every marker matches 'Summary <digit>:\n', so all markers have this length
summary_label_len = len('Summary X:\n')

for dataset_name in [k for k in datasets_unprocessed.keys() if 'generated' in k]:

    num_unfolded_generated_examples = 0
    # unfolded_generated_examples[m] collects the examples of de-randomized slot m
    unfolded_generated_examples = [[] for _ in range(num_summaries)]
    for example_id, example in datasets_unprocessed[dataset_name].items():
        source = example['text']
        labels = example['labels']

        # Absolute start offsets of all summary markers (a single regex pass)
        summary_start_positions = [m.start() for m in re_summary_start_generated.finditer(source)]
        if len(summary_start_positions) != num_summaries:
            raise ValueError(
                f"{dataset_name}, example {example_id}: expected {num_summaries} summary markers, "
                f"found {len(summary_start_positions)}"
            )
        source_summaries_offset = summary_start_positions[0]
        # Sentinel: the last summary runs to the end of the document
        summary_start_positions.append(len(source))

        # The text is everything between the text marker and the first summary marker
        text = source[re_text_start_generated.search(source).end():source_summaries_offset].strip()

        randomized_summaries_labels = []
        processed_labels = []
        for i in range(num_summaries):
            summary_content_start = summary_start_positions[i] + summary_label_len
            summary_content_end = summary_start_positions[i + 1]
            summary = source[summary_content_start:summary_content_end]
            summary_labels = []

            # Get all labels that lie completely inside this summary
            for label in labels:
                if label['start'] >= summary_content_start and label['end'] <= summary_content_end:
                    # Verify the label against the raw document
                    assert source[label['start']:label['end']] == label['text']
                    # Copy the label and make its position relative to the summary
                    new_label = label.copy()
                    new_label['start'] = label['start'] - summary_content_start
                    new_label['end'] = label['end'] - summary_content_start
                    # Check the label is at the correct position in the extracted summary
                    assert summary[new_label['start']:new_label['end']] == label['text']
                    summary_labels.append(new_label)
                    processed_labels.append(label)
            randomized_summaries_labels.append({'summary': summary, 'labels': summary_labels})

        # Check that all labels were processed (this also requires the labels to be
        # listed in the order of the summaries they belong to)
        assert processed_labels == labels
        assert sum(len(ex['labels']) for ex in randomized_summaries_labels) == len(labels)
        # Check that all characters of the source were processed
        assert source_summaries_offset + sum(len(ex['summary']) for ex in randomized_summaries_labels) + num_summaries * summary_label_len == len(source)
        # Now remove trailing whitespace from the summaries; leading whitespace is not
        # allowed because stripping it would shift the label positions
        for entry in randomized_summaries_labels:
            assert entry['summary'] == entry['summary'].lstrip()
            entry['summary'] = entry['summary'].rstrip()

        # De-randomize: move the summary shown at position i to its original slot
        permutation = hallucination_random_models[example_id]
        derandomized = [None] * num_summaries
        for position, entry in enumerate(randomized_summaries_labels):
            derandomized[permutation[position]] = entry
        # Every slot must have been filled
        assert all(entry is not None for entry in derandomized)

        # Move the examples in text-summary format into the per-slot lists
        for slot, entry in enumerate(derandomized):
            unfolded_generated_examples[slot].append({'text': text, 'summary': entry['summary'], 'labels': entry['labels']})
            num_unfolded_generated_examples += 1

    # Combine all lists into one: all examples of slot 0 first, then slot 1, ...
    assert num_unfolded_generated_examples == num_summaries * len(datasets_unprocessed[dataset_name])
    datasets[dataset_name] = [ex for slot_examples in unfolded_generated_examples for ex in slot_examples]
    num_labels = sum(len(ex['labels']) for ex in datasets[dataset_name])
    print(f"Added {len(datasets[dataset_name])} examples with {num_labels} labels to {dataset_name}")

Added 100 examples with 123 labels to hallucinations_100_generated_annotator_1
Added 100 examples with 118 labels to hallucinations_100_generated_annotator_2
Added 100 examples with 114 labels to hallucinations_100_generated_agreed


In [10]:
# Write out each dataset as a JSON Lines file (one example per line) into output_dir.
# The file keeps the name of the original BioC file, with .xml replaced by .jsonl.
output_dir = Path('/home/s_hegs02/patient_summaries_with_llms')
for dataset_name, examples in datasets.items():
    # Use pathlib for the name handling instead of splitting the path string on '/'
    file_name = Path(dataset_paths[dataset_name]).with_suffix('.jsonl').name
    # json.dumps escapes non-ASCII characters by default; the explicit encoding only
    # makes the output independent of the platform's default encoding.
    with open(output_dir / file_name, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(example) + '\n' for example in examples)