#### Code which combines ideas from the previous prototypes and uses a model trained on manually tagged data rather than a generic spaCy/scispaCy model. Includes the scispaCy entity linker and abbreviation detector.

The n2c2 files are preprocessed with a premade Python script taken from the weasel tutorials; see https://github.com/explosion/projects/blob/v3/tutorials/ner_pytorch_medical/scripts/preprocess.py

- Use the preprocessing script from weasel, but run the training manually with spaCy commands rather than the wrapper commands
- Ignore the PyTorch parts and use a simpler model instead
- Follow the spaCy tutorial for training

In [None]:
'''
Example uses:
- Extract all the medications that a patient is taking
- Extract all the symptoms that a patient is experiencing
- Extract all the diseases that a patient has
- Extract all the procedures that a patient has undergone

- Perform sentiment analysis on a patient's EHR to give an overall idea of the patient's health

- Allow filtering of the extracted data based on the disease it relates to or the date it was recorded


Conforming to the existing tags in the n2c2 dataset:

- Extract all the people that are mentioned in the document
- Extract all the problems that are mentioned in the document
- Extract all the tests that are mentioned in the document
- Extract all the treatments that are mentioned in the document

- Perform sentiment analysis on a patient's EHR to give an overall idea of the patient's health
'''

import spacy
from spacy import displacy
from spacy.tokens import Span
from spacy.tokens import Doc
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker
import numpy as np

# Uses textblob from NLTK
from spacytextblob.spacytextblob import SpacyTextBlob
import xml.etree.ElementTree as ET

# Load the newly trained model
nlp = spacy.load("output/model-best/")

# Sentencizer so that doc.sents is available for the sentence similarity scores
nlp.add_pipe("sentencizer")

# Textblob for sentiment analysis (read later through doc._.blob)
nlp.add_pipe("spacytextblob")

# Add the scispaCy abbreviation detector to the pipeline.
# NOTE: EntityLinker is imported above, but no linker component is added here.
nlp.add_pipe("abbreviation_detector")

# Take user input for the file name
# Set up for parsing XML from n2c2
# Strip surrounding whitespace so a stray space/newline in the typed path
# does not make the file lookup fail.
path = input("Enter the path to the file to process: ").strip()
tree = ET.parse(path)
root = tree.getroot()

# findtext returns None when the element is missing, which lets us fail with a
# clear message instead of an AttributeError on None; an empty <TEXT> element
# yields "" and is processed as an empty document.
text = root.findtext('TEXT')
if text is None:
    raise ValueError("No <TEXT> element found in " + repr(path))

# Process the given EHR
doc = nlp(text)


In [None]:
# Perform NER
def extract_entities(doc, entity):
    """Print every entity in ``doc`` whose label equals ``entity``.

    The matching entity texts are collected from ``doc.ents``, sorted
    alphabetically and printed one per line beneath a heading built from the
    capitalised label. Nothing is returned.

    Args:
        doc: Processed document exposing an ``ents`` iterable whose items
            have ``label_`` and ``text`` attributes.
        entity: Label to filter on (compared for exact equality).
    """
    matching_texts = sorted(
        span.text for span in doc.ents if span.label_ == entity
    )

    # Heading, e.g. "Problem(s):", surrounded by blank lines
    print(f"\n{entity.capitalize()}(s): \n")

    for text in matching_texts:
        print(text)

    print("\n")

# Use spacytextblob for sentiment analysis
def sentiment_analysis(doc):
    """Print the polarity and subjectivity stored on ``doc._.blob``.

    Args:
        doc: Processed document whose ``_.blob`` extension exposes
            ``polarity`` and ``subjectivity`` attributes.
    """
    print("\n")

    sentiment = doc._.blob
    print(f"Polarity: {sentiment.polarity}\n")
    print(f"Subjectivity: {sentiment.subjectivity} \n")

# Return the top 5 most similar sentences to the user input
def top_5_sentences(doc):
    """Prompt for a sentence and print the document sentences most similar to it.

    The user's sentence is processed with the module-level ``nlp`` pipeline,
    every sentence in ``doc.sents`` is scored against it with
    ``Span.similarity``, and up to five of the highest scoring sentences are
    printed in descending score order. Nothing is returned.

    Args:
        doc: Processed document with sentence boundaries set, so that
            ``doc.sents`` can be iterated.
    """
    user_input = nlp(input("\nEnter a sentence to compare with the document: "))

    # Compare the meaning of the user input with the sentences in the document
    results = [
        (sentence.similarity(user_input), sentence) for sentence in doc.sents
    ]

    # Sort the sentences by similarity score, best match first
    results.sort(key=lambda scored: scored[0], reverse=True)

    # Slice rather than index over range(5): a document with fewer than five
    # sentences would otherwise raise IndexError.
    for i, (score, sentence) in enumerate(results[:5]):
        print("\n***** Entry " + str(i) + " ***** score: " + str(score) + " *****\n")
        print(sentence)
        print("\n")

    return

In [None]:
# Handle printing and calling the functions based on user input

# Entity labels selectable through menu options 1-4, in menu order.
# Defined once here instead of being rebuilt on every loop iteration.
entity_types = ['person', 'problem', 'test', 'treatment']

while True:

    print("***************\n")

    print("Available operations:\n")
    print("1. Extract people from document\n")
    print("2. Extract problems/symptoms from document\n")
    print("3. Extract tests from document\n")
    print("4. Extract treatments from document\n")
    print("5. Perform sentiment analysis on document\n")
    print("6. List the top 5 sentences most similar to input\n")
    print("7. Filter data based on disease or date\n")
    print("8. Exit\n")

    print("***************\n")

    operation = input("Select operation to perform: ")

    # Only a non-numeric entry is expected to fail here, so catch ValueError
    # specifically rather than swallowing every exception.
    try:
        operation = int(operation)
    except ValueError:
        print("Invalid input, try again \n")
        continue

    if 1 <= operation <= 4:
        extract_entities(doc, entity_types[operation - 1])

    elif operation == 5:
        sentiment_analysis(doc)

    elif operation == 6:
        top_5_sentences(doc)

    elif operation == 7:
        # Listed in the menu but not implemented yet; tell the user instead of
        # silently doing nothing.
        print("Filtering is not implemented yet \n")

    elif operation == 8:
        exit(0)

    else:
        # Numbers outside the menu get the same response as non-numeric input.
        print("Invalid input, try again \n")
        continue

    run_again = input("Run again? (y/n)\n")

    # Accept "n"/"no" in any case and with stray whitespace; anything else
    # shows the menu again.
    if run_again.strip().lower() in ("n", "no"):
        exit(0)

In [None]:
# Converter for use of command line tools
# IPython shell escape (leading "!"): runs jupyter nbconvert with "--to script"
# on prototype5.ipynb. This line is not valid plain Python outside IPython/Jupyter.
!jupyter nbconvert --to script prototype5.ipynb