#### Runs two NER models (a general one and a biomedical one), then combines the entities from both into a placeholder Doc for visualisation. Combining the two entity sets allows for more generalised NER.

In [None]:
import spacy
from spacy import displacy
from spacy.tokens import Span
from spacy.tokens import Doc
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker
import xml.etree.ElementTree as ET

# --- Pipelines -------------------------------------------------------------
# General-purpose English pipeline.
nlp_general = spacy.load('en_core_web_md')

# Biomedical pipeline, extended with scispaCy's abbreviation detector and an
# entity linker configured with linker_name "umls".
nlp_biomedical = spacy.load('en_ner_bc5cdr_md')
nlp_biomedical.add_pipe("abbreviation_detector")
linker_config = {"resolve_abbreviations": True, "linker_name": "umls"}
nlp_biomedical.add_pipe("scispacy_linker", config=linker_config)

# --- Input text ------------------------------------------------------------
# Read the record and pull out the contents of its TEXT element.
tree = ET.parse('data/n2c2/108.xml')
root = tree.getroot()
text_element = root.find('TEXT')
text = text_element.text

# --- Run both models on the same text --------------------------------------
doc_general, doc_biomedical = nlp_general(text), nlp_biomedical(text)

# Extract entities from each model's output.
#
# Every entity is described on ONE shared token grid (doc_general's tokens),
# because the combined Doc that these entities are later attached to is built
# from doc_general's words. Labels are kept as strings (`label_`) rather than
# integer hashes (`label`): a hash is only guaranteed to resolve in the vocab
# of the pipeline that produced it, while a string label can be registered in
# any vocab when the Span is created.
entities_general = [
    {"start": ent.start, "end": ent.end, "label": ent.label_, "source": "general"}
    for ent in doc_general.ents
]

# Token indices of doc_biomedical are only meaningful for that Doc's own
# tokenisation, which is not guaranteed to match doc_general's. Both Docs
# were produced from the same `text`, so character offsets are comparable:
# use them to re-express each biomedical entity in doc_general's tokens.
# alignment_mode="expand" snaps offsets that fall inside a token outwards to
# the enclosing token boundaries.
entities_biomedical = []
for ent in doc_biomedical.ents:
    aligned = doc_general.char_span(
        ent.start_char, ent.end_char, alignment_mode="expand"
    )
    if aligned is None:
        # char_span could not produce a span for these offsets; skip it.
        continue
    entities_biomedical.append(
        {
            "start": aligned.start,
            "end": aligned.end,
            "label": ent.label_,
            "source": "biomedical",
        }
    )

# Combine entities
combined_entities = entities_general + entities_biomedical

# Sort entities by start position (stable: general entities come first on ties)
combined_entities.sort(key=lambda x: x['start'])

# Resolve overlaps, prioritizing entities from the biomedical model
resolved_entities = []
for entity in combined_entities:
    previous = resolved_entities[-1] if resolved_entities else None
    if previous is None or entity['start'] >= previous['end']:
        # No overlap with the last kept entity.
        resolved_entities.append(entity)
    elif entity['source'] == 'biomedical' and previous['source'] != 'biomedical':
        # Overlap between a general entity and a biomedical one: the
        # biomedical entity wins. Two biomedical entities can only overlap
        # as a side effect of the "expand" alignment; the earlier one is kept.
        resolved_entities[-1] = entity
    # Otherwise the overlapping entity is dropped.

# Build a blank Doc that mirrors doc_general token-for-token (same vocab,
# same token texts, same trailing whitespace) but carries no annotations.
words, spaces = [], []
for token in doc_general:
    words.append(token.text)
    spaces.append(token.whitespace_)
doc_combined = Doc(doc_general.vocab, words=words, spaces=spaces)

# Turn every resolved entity into a Span on the blank Doc, then attach them
# all at once. Nothing is assigned when there are no entities, so the Doc's
# entity annotation is left untouched in that case.
entity_spans = [
    Span(doc_combined, item["start"], item["end"], label=item["label"])
    for item in resolved_entities
]
if entity_spans:
    doc_combined.ents = entity_spans

# Visualize combined entities
displacy.render(doc_combined, style="ent")