#### Code which returns the top 5 sentences most similar to the user input, ranked by similarity score, using a scispaCy model

In [None]:
import spacy
from spacy.lang.en import English
from spacy import displacy
from scispacy.abbreviation import AbbreviationDetector
import xml.etree.ElementTree as ET

In [None]:
# Use a medically-trained model on the data.
# 'en_core_sci_md' must already be installed in the environment; spacy.load
# raises if the package cannot be found.
nlp = spacy.load('en_core_sci_md')

# Add the medical abbreviation detector to the pipeline.
# NOTE(review): the component is referenced by its string name; presumably it
# is registered as a side effect of the `scispacy.abbreviation` import at the
# top of the notebook, so that import must stay even though
# `AbbreviationDetector` is never referenced directly - confirm.
nlp.add_pipe("abbreviation_detector")

In [None]:
# Path of the clinical record to analyse
xml_path = 'data/n2c2/108.xml'

# Parse the XML file
tree = ET.parse(xml_path)

# Get the root of the XML document
root = tree.getroot()

# Extract everything inside the TEXT tag.
# Element.find returns None when no matching child exists, and .text is None
# for an empty element, so check both explicitly and fail with a clear message
# instead of an opaque AttributeError / a failure inside the nlp pipeline.
# (Compare with `is None`: an Element without children is falsy.)
text_element = root.find('TEXT')
if text_element is None or text_element.text is None:
    raise ValueError("No <TEXT> content found in " + xml_path)
text = text_element.text

# Process the medical text
doc = nlp(text)

# Print each detected entity together with its label
# NB non NER-specific scispacy models do not differentiate between entity types
for ent in doc.ents:
    print(ent, ent.label_)

In [None]:
# Get some user input to compare with the document
# Hard coded for simplicity
user_input = nlp("right knee pain and swelling")

# Maximum number of sentences to report
top_n = 5

# Compare the meaning of the user input with every sentence in the document,
# keeping (score, sentence) pairs so the score can be used as the sort key
results = [(sentence.similarity(user_input), sentence) for sentence in doc.sents]

# Sort the sentences by similarity score, highest first.
# The key uses the score only, so the sentence objects are never compared
# with each other when two scores are equal.
results.sort(key=lambda x: x[0], reverse=True)

# Print the top sentences. Slicing (rather than indexing 0..4) means a
# document with fewer than top_n sentences prints what it has instead of
# raising IndexError.
for i, (similarity_score, sentence) in enumerate(results[:top_n]):
    print(f"\n***** Entry {i} ***** score: {similarity_score} *****\n")
    print(sentence)
    print("\n")