In [27]:
import json
import numpy as np
import math
from collections import defaultdict
from enum import Enum
import csv
import pandas as pd

In [28]:
class Datasets(Enum):
    ALL = 0
    MIMIC_CXR = 1
    CHEXPERT = 2

In [29]:
MODELS = [
    "dygie-radgraph-base",
    "dygie-radgraph-biobert",
    "dygie-radgraph-bioclinicalbert",
    "dygie-radgraph-bluebert",
    "dygie-radgraph-pubmedbert",
]
ORIGINAL_MODEL = "dygie-radgraph-original"
RADGRAPH_CLASSES = [
    "CHAN-CON-AP",
    "CHAN-WOR",
    "CHAN-IMP",
    "CHAN-CON-RES",
    "CHAN-NC",
    "CHAN-DEV-AP",
    "CHAN-DEV-PLACE",
    "CHAN-DEV-DISA",
    "ANAT-DP",
    "OBS-DP",
    "OBS-U",
    "OBS-DA"
]
ORIGINAL_CLASSES = [
    "ANAT-DP",
    "OBS-DP",
    "OBS-U",
    "OBS-DA"
]
RADGRAPH_RELATION_CLASSES = ['modify', 'located_at', 'suggestive_of']

In [30]:
def load_graph(filename):
    graph = []
    with open(f'../../data/{filename}.json') as f:
        lines = f.readlines()
    graph = [json.loads(line) for line in lines]
    graph = {r["doc_key"]: r for r in graph}
    return graph

original_radgraph = load_graph('dygie_test_original')
radgraph_change = load_graph('dygie_test')

In [31]:
def print_metrics(metrics_dict, radgraph_classes):
    total_tps = 0
    total_fps = 0
    total_fns = 0
    macro_precision = 0
    macro_recall = 0
    macro_f1 = 0
    for radgraph_class in radgraph_classes:
        tps = np.float64(metrics_dict[radgraph_class]['tps'])
        total_tps += tps
        fps = np.float64(metrics_dict[radgraph_class]['fps'])
        total_fps += fps
        fns = np.float64(metrics_dict[radgraph_class]['fns'])
        total_fns += fns
        total_actual = metrics_dict[radgraph_class]['total_actual']
        total_predicted = metrics_dict[radgraph_class]['total_predicted']
        precision = tps / (tps + fps)
        macro_precision += np.nan_to_num(precision, nan=0)
        recall = tps / (tps + fns)
        macro_recall += np.nan_to_num(recall, nan=0)
        f1 = 2 * precision * recall / (precision + recall)
        macro_f1 += np.nan_to_num(f1, nan=0)
        print(f"* Class {radgraph_class}")
        print(f"  - Precision: {precision}")
        print(f"  - Recall: {recall}")
        print(f"  - F1: {f1}")
        print(f"  - Total actual: {total_actual}")
        print(f"  - Total predicted: {total_predicted}")
    micro_precision = total_tps / (total_tps + total_fps)
    micro_recall = total_tps / (total_tps + total_fns)
    micro_f1 = 2 * micro_precision * micro_recall / (micro_precision + micro_recall)
    macro_precision /= len(radgraph_classes)
    macro_recall /= len(radgraph_classes)
    macro_f1 /= len(radgraph_classes)
    print(f"* Micro precision: {micro_precision}")
    print(f"* Micro recall: {micro_recall}")
    print(f"* Micro F1: {micro_f1}")
    print(f"* Macro precision: {macro_precision}")
    print(f"* Macro recall: {macro_recall}")
    print(f"* Macro F1: {macro_f1}")

In [32]:
def to_tuples_set(array):
    return {tuple(e) for e in array}

def add_statistics(metrics_dict, radgraph_class, ner_labels_of_class, ner_predictions_of_class):
    metrics_dict[radgraph_class]['tps'] += len(ner_labels_of_class & ner_predictions_of_class)
    metrics_dict[radgraph_class]['fps'] += len(ner_predictions_of_class - ner_labels_of_class)
    metrics_dict[radgraph_class]['fns'] += len(ner_labels_of_class - ner_predictions_of_class)
    metrics_dict[radgraph_class]['total_actual'] += len(ner_labels_of_class)
    metrics_dict[radgraph_class]['total_predicted'] += len(ner_predictions_of_class)

def evaluate(model_name, evaluate_original=False, model_original=False, dataset=Datasets.ALL):
    print(f"———————————————————[ Evaluating {model_name} ]———————————————————")
    with open(f'../../models/{model_name}/test_predictions.jsonl', 'r') as f:
        lines = f.readlines()
    model_predictions = [json.loads(line) for line in lines]
    
    ner_metrics_dict = defaultdict(lambda: {'tps': 0, 'fps': 0, 'fns': 0, 'total_actual': 0, 'total_predicted': 0})
    relations_metrics_dict = defaultdict(lambda: {'tps': 0, 'fps': 0, 'fns': 0, 'total_actual': 0, 'total_predicted': 0})
    
    for data_sample in model_predictions:
        if dataset is not Datasets.ALL:
            if dataset is Datasets.MIMIC_CXR and "/" not in data_sample["doc_key"]:
                continue
            elif dataset is Datasets.CHEXPERT and "/" in data_sample["doc_key"]:
                continue
        
        if evaluate_original:
            ner_classes = ORIGINAL_CLASSES
            
            change_ner_labels = to_tuples_set(radgraph_change[data_sample["doc_key"]]['ner'][0])
            original_ner_labels = to_tuples_set(original_radgraph[data_sample["doc_key"]]['ner'][0])
            ner_labels = change_ner_labels & original_ner_labels
            if not model_original:
                ignored_ner = change_ner_labels - ner_labels
            else:
                ignored_ner = original_ner_labels - ner_labels
                
            change_relation_labels = to_tuples_set(radgraph_change[data_sample["doc_key"]]['relations'][0])
            original_relation_labels = to_tuples_set(original_radgraph[data_sample["doc_key"]]['relations'][0])
            relation_labels = change_relation_labels & original_relation_labels
            if not model_original:
                ignored_relations = change_relation_labels - relation_labels
            else:
                ignored_relations = original_ner_labels - relation_labels
        else:
            ner_classes = RADGRAPH_CLASSES
            ner_labels = to_tuples_set(data_sample['ner'][0])
            ignored_ner = set()
            relation_labels = to_tuples_set(data_sample['relations'][0])
            ignored_relations = set()

        ner_predictions = to_tuples_set([prediction[:3] for prediction in data_sample['predicted_ner'][0]])
        relation_predictions = to_tuples_set([prediction[:5] for prediction in data_sample['predicted_relations'][0]])
        
        for radgraph_class in ner_classes:
            ner_labels_of_class = {l for l in ner_labels if l[2] == radgraph_class}
            ner_predictions_of_class = {p for p in ner_predictions if p[2] == radgraph_class and p not in ignored_ner}
            add_statistics(ner_metrics_dict, radgraph_class, ner_labels_of_class, ner_predictions_of_class)
            
        for radgraph_class in RADGRAPH_RELATION_CLASSES:
            relation_labels_of_class = {l for l in relation_labels if l[4] == radgraph_class}
            relation_predictions_of_class = {p for p in relation_predictions if p[4] == radgraph_class and p not in ignored_relations}
            add_statistics(relations_metrics_dict, radgraph_class, relation_labels_of_class, relation_predictions_of_class)
       
    print("NER results")
    print_metrics(ner_metrics_dict, ner_classes)
    print()
    print("Relations results")
    print_metrics(relations_metrics_dict, RADGRAPH_RELATION_CLASSES)
    print()
    print()

## General evaluation results

In [33]:
for model in MODELS:
    evaluate(model)

———————————————————[ Evaluating dygie-radgraph-base ]———————————————————
NER results
* Class CHAN-CON-AP
  - Precision: 1.0
  - Recall: 0.6666666666666666
  - F1: 0.8
  - Total actual: 6
  - Total predicted: 4
* Class CHAN-WOR
  - Precision: 0.7037037037037037
  - Recall: 0.9047619047619048
  - F1: 0.7916666666666667
  - Total actual: 21
  - Total predicted: 27
* Class CHAN-IMP
  - Precision: 1.0
  - Recall: 0.8181818181818182
  - F1: 0.9
  - Total actual: 11
  - Total predicted: 9
* Class CHAN-CON-RES
  - Precision: 0.3333333333333333
  - Recall: 1.0
  - F1: 0.5
  - Total actual: 1
  - Total predicted: 3
* Class CHAN-NC
  - Precision: 0.9243697478991597
  - Recall: 0.8527131782945736
  - F1: 0.8870967741935484
  - Total actual: 129
  - Total predicted: 119
* Class CHAN-DEV-AP
  - Precision: 1.0
  - Recall: 0.4
  - F1: 0.5714285714285715
  - Total actual: 5
  - Total predicted: 2
* Class CHAN-DEV-PLACE
  - Precision: 1.0
  - Recall: 1.0
  - F1: 1.0
  - Total actual: 1
  - Total predict



## Head-to-head comparison

In [40]:
evaluate("dygie-radgraph-bioclinicalbert", evaluate_original=True, dataset=Datasets.MIMIC_CXR)

———————————————————[ Evaluating dygie-radgraph-bioclinicalbert ]———————————————————
NER results
* Class ANAT-DP
  - Precision: 0.955637707948244
  - Recall: 0.9773156899810964
  - F1: 0.9663551401869158
  - Total actual: 529
  - Total predicted: 541
* Class OBS-DP
  - Precision: 0.9006928406466512
  - Recall: 0.8986175115207373
  - F1: 0.8996539792387543
  - Total actual: 434
  - Total predicted: 433
* Class OBS-U
  - Precision: 0.725
  - Recall: 0.7073170731707317
  - F1: 0.7160493827160495
  - Total actual: 41
  - Total predicted: 40
* Class OBS-DA
  - Precision: 0.9426229508196722
  - Recall: 0.9623430962343096
  - F1: 0.9523809523809523
  - Total actual: 239
  - Total predicted: 244
* Micro precision: 0.9268680445151033
* Micro recall: 0.9380530973451328
* Micro F1: 0.9324270291883247
* Macro precision: 0.8809883748536418
* Macro recall: 0.8863983427267187
* Macro F1: 0.883609863630668

Relations results
* Class modify
  - Precision: 0.8271844660194175
  - Recall: 0.814531548757170

In [43]:
evaluate(ORIGINAL_MODEL, evaluate_original=True, model_original=True, dataset=Datasets.MIMIC_CXR)

———————————————————[ Evaluating dygie-radgraph-original ]———————————————————
NER results
* Class ANAT-DP
  - Precision: 0.9574074074074074
  - Recall: 0.9773156899810964
  - F1: 0.9672591206735266
  - Total actual: 529
  - Total predicted: 540
* Class OBS-DP
  - Precision: 0.9207459207459208
  - Recall: 0.9101382488479263
  - F1: 0.9154113557358053
  - Total actual: 434
  - Total predicted: 429
* Class OBS-U
  - Precision: 0.7297297297297297
  - Recall: 0.6585365853658537
  - F1: 0.6923076923076923
  - Total actual: 41
  - Total predicted: 37
* Class OBS-DA
  - Precision: 0.9251968503937008
  - Recall: 0.9832635983263598
  - F1: 0.9533468559837729
  - Total actual: 239
  - Total predicted: 254
* Micro precision: 0.9317460317460318
* Micro recall: 0.9444891391794047
* Micro F1: 0.9380743108270078
* Macro precision: 0.8832699770691897
* Macro recall: 0.8823135306303092
* Macro F1: 0.8820812561751993

Relations results
* Class modify
  - Precision: 0.8323809523809523
  - Recall: 0.8355640

In [41]:
evaluate("dygie-radgraph-bioclinicalbert", evaluate_original=True, dataset=Datasets.CHEXPERT)

———————————————————[ Evaluating dygie-radgraph-bioclinicalbert ]———————————————————
NER results
* Class ANAT-DP
  - Precision: 0.9477848101265823
  - Recall: 0.9344773790951638
  - F1: 0.9410840534171249
  - Total actual: 641
  - Total predicted: 632
* Class OBS-DP
  - Precision: 0.9
  - Recall: 0.861878453038674
  - F1: 0.8805268109125118
  - Total actual: 543
  - Total predicted: 520
* Class OBS-U
  - Precision: 0.6440677966101694
  - Recall: 0.8085106382978723
  - F1: 0.7169811320754716
  - Total actual: 47
  - Total predicted: 59
* Class OBS-DA
  - Precision: 0.950920245398773
  - Recall: 0.9281437125748503
  - F1: 0.9393939393939393
  - Total actual: 167
  - Total predicted: 163
* Micro precision: 0.9170305676855895
* Micro recall: 0.9012875536480687
* Micro F1: 0.9090909090909091
* Macro precision: 0.8606932130338811
* Macro recall: 0.8832525457516401
* Macro F1: 0.869496483949762

Relations results
* Class modify
  - Precision: 0.7589424572317263
  - Recall: 0.6863572433192686
 

In [42]:
evaluate(ORIGINAL_MODEL, evaluate_original=True, model_original=True, dataset=Datasets.CHEXPERT)

———————————————————[ Evaluating dygie-radgraph-original ]———————————————————
NER results
* Class ANAT-DP
  - Precision: 0.9416403785488959
  - Recall: 0.9313572542901716
  - F1: 0.9364705882352942
  - Total actual: 641
  - Total predicted: 634
* Class OBS-DP
  - Precision: 0.851985559566787
  - Recall: 0.8692449355432781
  - F1: 0.8605287146763901
  - Total actual: 543
  - Total predicted: 554
* Class OBS-U
  - Precision: 0.6129032258064516
  - Recall: 0.8085106382978723
  - F1: 0.6972477064220183
  - Total actual: 47
  - Total predicted: 62
* Class OBS-DA
  - Precision: 0.9325153374233128
  - Recall: 0.9101796407185628
  - F1: 0.9212121212121211
  - Total actual: 167
  - Total predicted: 163
* Micro precision: 0.8910120311394196
* Micro recall: 0.9005722460658083
* Micro F1: 0.8957666310921379
* Macro precision: 0.8347611253363618
* Macro recall: 0.8798231172124712
* Macro F1: 0.8538647826364558

Relations results
* Class modify
  - Precision: 0.7380239520958084
  - Recall: 0.69338959

## MIMIC-CXR Only

In [38]:
for model in MODELS:
    evaluate(model, dataset=Datasets.MIMIC_CXR)

———————————————————[ Evaluating dygie-radgraph-base ]———————————————————
NER results
* Class CHAN-CON-AP
  - Precision: 1.0
  - Recall: 0.8
  - F1: 0.888888888888889
  - Total actual: 5
  - Total predicted: 4
* Class CHAN-WOR
  - Precision: 0.42857142857142855
  - Recall: 0.75
  - F1: 0.5454545454545454
  - Total actual: 4
  - Total predicted: 7
* Class CHAN-IMP
  - Precision: 1.0
  - Recall: 0.6666666666666666
  - F1: 0.8
  - Total actual: 6
  - Total predicted: 4
* Class CHAN-CON-RES
  - Precision: 0.3333333333333333
  - Recall: 1.0
  - F1: 0.5
  - Total actual: 1
  - Total predicted: 3
* Class CHAN-NC
  - Precision: 0.9137931034482759
  - Recall: 0.8833333333333333
  - F1: 0.8983050847457628
  - Total actual: 60
  - Total predicted: 58
* Class CHAN-DEV-AP
  - Precision: nan
  - Recall: nan
  - F1: nan
  - Total actual: 0
  - Total predicted: 0
* Class CHAN-DEV-PLACE
  - Precision: nan
  - Recall: nan
  - F1: nan
  - Total actual: 0
  - Total predicted: 0
* Class CHAN-DEV-DISA
  - Pr



## CheXpert Only

In [39]:
for model in MODELS:
    evaluate(model, dataset=Datasets.CHEXPERT)

———————————————————[ Evaluating dygie-radgraph-base ]———————————————————
NER results
* Class CHAN-CON-AP
  - Precision: nan
  - Recall: 0.0
  - F1: nan
  - Total actual: 1
  - Total predicted: 0
* Class CHAN-WOR
  - Precision: 0.8
  - Recall: 0.9411764705882353
  - F1: 0.8648648648648648
  - Total actual: 17
  - Total predicted: 20
* Class CHAN-IMP
  - Precision: 1.0
  - Recall: 1.0
  - F1: 1.0
  - Total actual: 5
  - Total predicted: 5
* Class CHAN-CON-RES
  - Precision: nan
  - Recall: nan
  - F1: nan
  - Total actual: 0
  - Total predicted: 0
* Class CHAN-NC
  - Precision: 0.9344262295081968
  - Recall: 0.8260869565217391
  - F1: 0.8769230769230769
  - Total actual: 69
  - Total predicted: 61
* Class CHAN-DEV-AP
  - Precision: 1.0
  - Recall: 0.4
  - F1: 0.5714285714285715
  - Total actual: 5
  - Total predicted: 2
* Class CHAN-DEV-PLACE
  - Precision: 1.0
  - Recall: 1.0
  - F1: 1.0
  - Total actual: 1
  - Total predicted: 1
* Class CHAN-DEV-DISA
  - Precision: 1.0
  - Recall: 1.0




  - Recall: 0.6190476190476191
  - F1: 0.7222222222222222
  - Total actual: 63
  - Total predicted: 45
* Micro precision: 0.731638418079096
* Micro recall: 0.6943699731903485
* Micro F1: 0.7125171939477304
* Macro precision: 0.7745182735798939
* Macro recall: 0.6820248820248821
* Macro F1: 0.7211422568024756


———————————————————[ Evaluating dygie-radgraph-bioclinicalbert ]———————————————————
NER results
* Class CHAN-CON-AP
  - Precision: nan
  - Recall: 0.0
  - F1: nan
  - Total actual: 1
  - Total predicted: 0
* Class CHAN-WOR
  - Precision: 0.8095238095238095
  - Recall: 1.0
  - F1: 0.8947368421052632
  - Total actual: 17
  - Total predicted: 21
* Class CHAN-IMP
  - Precision: 1.0
  - Recall: 1.0
  - F1: 1.0
  - Total actual: 5
  - Total predicted: 5
* Class CHAN-CON-RES
  - Precision: nan
  - Recall: nan
  - F1: nan
  - Total actual: 0
  - Total predicted: 0
* Class CHAN-NC
  - Precision: 0.9242424242424242
  - Recall: 0.8840579710144928
  - F1: 0.9037037037037037
  - Total actual: