# Analyse Labelings

This notebook is used to analyse the labelings of the dataset produced by two annotators.

In [9]:
from pathlib import Path
from bioc import biocxml
import re
import pandas as pd

In [10]:
# General definitions
# Document ids listed as red-flagged; these are the ids excluded below.
red_flagged_ids = [
    10, 18, 20, 43, 44, 48, 50, 53, 62, 66, 67, 69, 72,
    75, 86, 95, 98, 100, 110, 111, 113, 115, 123, 125, 134, 137,
]
exluded_ids = red_flagged_ids
# Candidate ids 0..12 (inclusive) without the excluded ones.
included_ids = [doc_id for doc_id in range(0, 13) if doc_id not in exluded_ids]

In [11]:
# Label definitions: short annotation-id prefix -> full label name
labels = dict(
    c='condition_unsupported',
    p='procedure_unsupported',
    m='medication_unsupported',
    t='time_unsupported',
    l='location_unsupported',
    n='number_unsupported',
    na='name_unsupported',
    w='word_unsupported',
    o='other_unsupported',
    co='contradicted_fact',
    i='incorrect_fact',
)

In [12]:
# Load the labelings of both annotators (BioC XML files)
data_path = Path('/home/s/s_hegs02/scratch/MedTator')
labeling_1_path = data_path / '10_label_silver_examples_annotator_1' / 'labelled-dataset-BioC.xml'
labeling_2_path = data_path / '11_label_silver_examples_annotator_2' / 'labelled-dataset-BioC.xml'
# Files are opened in binary mode and handed to the BioC XML loader
with labeling_1_path.open('rb') as fp:
    labeling_1 = biocxml.load(fp)
with labeling_2_path.open('rb') as fp:
    labeling_2 = biocxml.load(fp)

In [None]:
# Create dict of document ids and their annotations
def extract_id(document_name):
    """Return the id part of a document name.

    The id is whatever follows the last underscore in the part of the name
    that precedes the first dot, e.g. ``'doc_001.txt'`` -> ``'001'``.
    """
    stem, _, _ = document_name.partition('.')
    _, _, doc_id = stem.rpartition('_')
    return doc_id

def parse_label(annotation):
    """Convert an annotation into a dict with start, end, length, label, text.

    The span is taken from the first location of the annotation. The label is
    looked up in ``labels`` using the lower-cased first run of non-digit
    characters in the annotation id.
    """
    location = annotation.locations[0]
    start = location.offset
    length = location.length
    # First run of non-digit characters in the annotation id
    prefix = str(re.findall(r'[^\d]+', annotation.id)[0])
    return {
        'start': start,
        'end': start + length,
        'length': length,
        'label': labels[prefix.lower()],
        'text': annotation.text,
    }

# Order annotation dicts by where they start
def sort_by_start(l):
    """Return a new list holding the dicts of ``l`` in ascending ``'start'`` order."""
    ordered = list(l)
    ordered.sort(key=lambda entry: entry['start'])
    return ordered

def _annotations_by_document(collection):
    """Map every document id in ``collection`` to its start-sorted annotation dicts.

    Only the annotations of the first passage of each document are read.
    """
    return {
        extract_id(document.id): sort_by_start(
            [parse_label(a) for a in document.passages[0].annotations]
        )
        for document in collection.documents
    }


labeling_1_dict = _annotations_by_document(labeling_1)
labeling_2_dict = _annotations_by_document(labeling_2)

# Only keep documents with ids in included_ids
labeling_1_dict = {k: v for k, v in labeling_1_dict.items() if int(k) in included_ids}
labeling_2_dict = {k: v for k, v in labeling_2_dict.items() if int(k) in included_ids}
# Everything below pairs the two labelings per document, so both must cover the same ids
assert labeling_1_dict.keys() == labeling_2_dict.keys(), \
    "The two labelings do not cover the same included documents"

# Create dataframe of document ids and their annotations
# (loop variable is called doc_id so the builtin id() is not shadowed)
data_list = [
    {'id': doc_id, 'labels_1': labeling_1_dict[doc_id], 'labels_2': labeling_2_dict[doc_id]}
    for doc_id in labeling_1_dict
]
data = pd.DataFrame(data_list)

In [55]:
# Determine which annotations are in agreement with an overlap
# Minimum overlap (relative to the shorter annotation) required for an agreement
overlap_ratio = 0.8

def get_overlap(a, b):
    """Return the overlap of two annotations relative to the shorter one.

    Both arguments are dicts with the keys ``'start'``, ``'end'`` and
    ``'length'``. The result is the number of shared positions divided by the
    smaller of the two lengths: 0 for disjoint spans, 1.0 when the shorter
    span lies completely inside the longer one. If the shorter annotation has
    length 0 the result is 0.0 instead of a ZeroDivisionError.
    """
    shorter = min(a['length'], b['length'])
    if shorter == 0:
        # An empty span shares no characters with anything
        return 0.0
    shared = max(0, min(a['end'], b['end']) - max(a['start'], b['start']))
    return shared / shorter
    
def get_agreement_list(a_list, b_list, same_label=False):
    """Collect every pair of annotations from the two lists that is in agreement.

    A pair agrees when its overlap (see ``get_overlap``) is at least
    ``overlap_ratio``; with ``same_label=True`` both annotations must also
    carry the same ``'label'``. Returns a list of
    ``(overlap, annotation_a, annotation_b)`` tuples, ordered by ``a_list``
    and then by ``b_list``.
    """
    matches = []
    for first in a_list:
        for second in b_list:
            # Both annotations are dicts with start, end, length, label, text
            ratio = get_overlap(first, second)
            if not ratio >= overlap_ratio:
                continue
            if same_label and first['label'] != second['label']:
                continue
            matches.append((ratio, first, second))
    return matches

def get_labels_no_agreement(labels, agreement, labeller):
    """Return the annotations of ``labels`` that appear in no agreement tuple.

    ``agreement`` holds tuples of (overlap ratio, annotation of labeller 1,
    annotation of labeller 2); ``labeller`` is the tuple position (1 or 2)
    that the annotations in ``labels`` are compared against. The order of
    ``labels`` is preserved.
    """
    return [
        label
        for label in labels
        if not any(label == entry[labeller] for entry in agreement)
    ]


# Agreements between the two raters, first ignoring the labels, then requiring equal labels
for agreement_column, require_same_label in (('agreement_diff', False), ('agreement_same', True)):
    data[agreement_column] = data.apply(
        lambda row: get_agreement_list(row['labels_1'], row['labels_2'], same_label=require_same_label),
        axis=1,
    )

# (target column, labels column, agreement column, position of the rater in the agreement tuple)
no_agreement_columns = (
    ('labels_1_no_agreement_diff', 'labels_1', 'agreement_diff', 1),
    ('labels_2_no_agreement_diff', 'labels_2', 'agreement_diff', 2),
    ('labels_1_no_agreement_same', 'labels_1', 'agreement_same', 1),
    ('labels_2_no_agreement_same', 'labels_2', 'agreement_same', 2),
)
for target_column, labels_column, agreement_column, labeller in no_agreement_columns:
    data[target_column] = data.apply(
        lambda row: get_labels_no_agreement(row[labels_column], row[agreement_column], labeller),
        axis=1,
    )

# Sanity check per rater and agreement kind: labels in agreement plus labels not in
# agreement must add up to the total number of labels.
# NOTE(review): this only holds while no annotation is part of more than one agreement tuple.
for target_column, labels_column, agreement_column, labeller in no_agreement_columns:
    assert data.apply(
        lambda row: len(row[labels_column]) == len(row[agreement_column]) + len(row[target_column]),
        axis=1,
    ).all()

In [56]:
# Print general statistics
# Number of list entries summed over all documents, per column
entry_totals = {
    column: data[column].apply(len).sum()
    for column in ('labels_1', 'labels_2', 'agreement_diff', 'agreement_same')
}
total_labels_1 = entry_totals['labels_1']
total_labels_2 = entry_totals['labels_2']
total_agreement_diff = entry_totals['agreement_diff']
total_agreement_same = entry_totals['agreement_same']

# Percentages are given relative to rater 1 first, then relative to rater 2
print(f"Included {len(included_ids)} documents ({included_ids})")
print(f"Total labels rater 1: {total_labels_1}")
print(f"Total labels rater 2: {total_labels_2}")
print(f"Total labels in {overlap_ratio} agreement w/ diff labels: {total_agreement_diff} ({total_agreement_diff / total_labels_1 * 100:.2f}%, {total_agreement_diff / total_labels_2 * 100:.2f}%)")
print(f"Total labels in {overlap_ratio} agreement w/ same labels: {total_agreement_same} ({total_agreement_same / total_labels_1 * 100:.2f}%, {total_agreement_same / total_labels_2 * 100:.2f}%)")

Included 12 documents ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12])
Total labels rater 1: 24
Total labels rater 2: 8
Total labels in 0.8 agreement w/ diff labels: 3 (12.50%, 37.50%)
Total labels in 0.8 agreement w/ same labels: 2 (8.33%, 25.00%)


In [None]:
# Print statistics per document in the following format:
# Document <document id>
#   Rater 1: total number of labels of rater 1
#   Rater 2: total number of labels of rater 2
#   Both Raters (count): numbered list of annotations made by both raters (sufficient overlap, labels may differ)
#   Only Rater 1 (count): numbered list of annotations of rater 1 without a matching annotation of rater 2
#   Only Rater 2 (count): numbered list of annotations of rater 2 without a matching annotation of rater 1

def format_label(label):
    """Render an annotation dict as ``'<text> <start>-<end> (<label>)'``.

    ``label`` is a dict with the keys start, end, length, label, text.
    """
    text = label['text']
    start, end = label['start'], label['end']
    name = label['label']
    return f"{text} {start}-{end} ({name})"

def format_labels(labels):
    """Format annotation dicts as a numbered, tab-indented list (one per line).

    Returns an empty string for an empty list. Otherwise the result starts
    with a newline, as ``format_agreements`` does, so that the first item is
    printed below the header it is appended to instead of on the same line.
    """
    if not labels:
        return ''
    return '\n' + '\n'.join(f"\t{i}. {format_label(l)}" for i, l in enumerate(labels, start=1))

def format_agreement(agreement):
    """Render one agreement tuple as a single line of text.

    ``agreement`` is a tuple of (overlap ratio, annotation of labeller 1,
    annotation of labeller 2). The line shows the ratio with two decimals,
    both texts and both spans; the two labels are appended only when they
    differ.
    """
    overlap, first, second = agreement[0], agreement[1], agreement[2]
    text = f"{overlap:.2f} {first['text']} vs. {second['text']} {first['start']}-{first['end']}/{second['start']}-{second['end']}"
    if first['label'] == second['label']:
        return text
    return text + f" ({first['label']} vs. {second['label']})"

def format_agreements(agreements):
    """Format agreement tuples as a numbered, tab-indented list.

    Returns an empty string when there are no agreements; otherwise the
    result starts with a newline followed by one line per agreement.
    """
    if len(agreements) == 0:
        return ''
    numbered = [
        f"\t{position}. {format_agreement(entry)}"
        for position, entry in enumerate(agreements, start=1)
    ]
    return '\n' + '\n'.join(numbered)

for index, row in data.iterrows():
    # Agreements and leftovers computed without the same-label requirement
    both_raters = row['agreement_diff']
    only_rater_1 = row['labels_1_no_agreement_diff']
    only_rater_2 = row['labels_2_no_agreement_diff']
    report_lines = [
        f"Document {row['id']}",
        f"  Rater 1: {len(row['labels_1'])}",
        f"  Rater 2: {len(row['labels_2'])}",
        f"  Both Raters ({len(both_raters)}):{format_agreements(both_raters)}",
        f"  Only Rater 1 ({len(only_rater_1)}):{format_labels(only_rater_1)}",
        f"  Only Rater 2 ({len(only_rater_2)}):{format_labels(only_rater_2)}",
    ]
    print('\n'.join(report_lines))
    # Blank line between documents
    print()
    

In [42]:
# Print, per document, the annotations of rater 1 that overlap an annotation of rater 2.
# NOTE(review): the check below accepts any overlap (one span starting inside the other,
# end position included); it does not apply the 80% threshold.
for document_id, annotations_1 in labeling_1_dict.items():
    if document_id not in labeling_2_dict:
        continue
    annotations_2 = labeling_2_dict[document_id]
    # Pair every rater-1 annotation with the first rater-2 annotation it overlaps
    overlapping_annotations = []
    for a1 in annotations_1:
        for a2 in annotations_2:
            a1_starts_in_a2 = a2['start'] <= a1['start'] <= a2['end']
            a2_starts_in_a1 = a1['start'] <= a2['start'] <= a1['end']
            if a1_starts_in_a2 or a2_starts_in_a1:
                overlapping_annotations.append((a1, a2))
                break
    # Document id, text of rater 1 and both labels
    for a1, a2 in overlapping_annotations:
        print(document_id, a1['text'], a1['label'], a2['label'])

001 resection of an inguinal mass procedure_unsupported condition_unsupported
