In [None]:
import spacy
import scispacy

from scispacy.linking import EntityLinker
import json

# Base scispaCy pipeline; its entities are what the linker below resolves.
nlp = spacy.load("en_core_sci_md")

# Adding the linker is slow the first time: roughly 1GB of data has to be
# downloaded and a large JSON knowledge base loaded. The downloads are cached,
# so subsequent runs should be faster.
#
# resolve_abbreviations is optional and only meaningful when an
# AbbreviationDetector pipe has already been added to the pipeline; with both
# in place, linking is performed on the long form of each abbreviation only.
# NOTE(review): no visible cell adds an AbbreviationDetector pipe -- confirm
# whether resolve_abbreviations=True is intended to have an effect here.
linker_config = {"linker_name": "umls", "resolve_abbreviations": True}
nlp.add_pipe("scispacy_linker", config=linker_config)

In [None]:
# Annotate every record of the test split with the entities found in its
# "summary" text:
#   "2_medical" -- mentions that also occur in the record's "question"
#   "1_medical" -- all other accepted mentions
# A mention is accepted only when its lower-cased text equals the canonical
# name, or one of the aliases, of its first knowledge-base candidate.
data_test = []

# The linker pipe is the same object for every entity, so look it up once
# instead of once per entity.
linker = nlp.get_pipe("scispacy_linker")
# Generic mentions that are never recorded.
ignored_mentions = {"information", "treatment", "treatments"}

# Explicit UTF-8 keeps decoding independent of the platform locale; the handle
# is called `infile` so the builtin input() is not shadowed for later cells.
with open("test.txt", "r", encoding="utf-8") as infile:
    for line in infile:
        if not line.strip():
            # Blank lines (e.g. a trailing newline) are not JSON documents.
            continue
        patientDict = json.loads(line)
        patientDict["1_medical"] = []
        patientDict["2_medical"] = []
        doc = nlp(patientDict["summary"])
        for ent in doc.ents:
            name = ent.text.lower()
            if name in ignored_mentions or not ent._.kb_ents:
                continue

            # The first element of the first candidate is the concept id used
            # to look the entity up in the linker's knowledge base.
            kb_entity = linker.kb.cui_to_entity[ent._.kb_ents[0][0]]
            is_known_name = kb_entity.canonical_name.lower() == name or any(
                alias.lower() == name for alias in kb_entity.aliases
            )
            if not is_known_name:
                continue

            question = patientDict["question"].lower()
            # NOTE(review): `singularize` is not imported or defined in any
            # visible cell; on a fresh kernel this raises NameError whenever
            # `name` is not found in the question. Add the import to the
            # imports cell (presumably an inflection helper -- confirm which).
            if name in question or singularize(name) in question:
                patientDict["2_medical"].append(ent.text)
            else:
                patientDict["1_medical"].append(ent.text)

        data_test.append(patientDict)

In [None]:
# Annotate every record of the train split with the entities found in its
# "summary" text:
#   "2_medical" -- mentions that also occur in the record's "question"
#   "1_medical" -- all other accepted mentions
# A mention is accepted only when its lower-cased text equals the canonical
# name, or one of the aliases, of its first knowledge-base candidate.
data_train = []

# The linker pipe is the same object for every entity, so look it up once
# instead of once per entity.
linker = nlp.get_pipe("scispacy_linker")
# Generic mentions that are never recorded.
ignored_mentions = {"information", "treatment", "treatments"}

# Explicit UTF-8 keeps decoding independent of the platform locale; the handle
# is called `infile` so the builtin input() is not shadowed for later cells.
with open("train.txt", "r", encoding="utf-8") as infile:
    for line in infile:
        if not line.strip():
            # Blank lines (e.g. a trailing newline) are not JSON documents.
            continue
        patientDict = json.loads(line)
        patientDict["1_medical"] = []
        patientDict["2_medical"] = []
        doc = nlp(patientDict["summary"])
        for ent in doc.ents:
            name = ent.text.lower()
            if name in ignored_mentions or not ent._.kb_ents:
                continue

            # The first element of the first candidate is the concept id used
            # to look the entity up in the linker's knowledge base.
            kb_entity = linker.kb.cui_to_entity[ent._.kb_ents[0][0]]
            is_known_name = kb_entity.canonical_name.lower() == name or any(
                alias.lower() == name for alias in kb_entity.aliases
            )
            if not is_known_name:
                continue

            question = patientDict["question"].lower()
            # NOTE(review): `singularize` is not imported or defined in any
            # visible cell; on a fresh kernel this raises NameError whenever
            # `name` is not found in the question. Add the import to the
            # imports cell (presumably an inflection helper -- confirm which).
            if name in question or singularize(name) in question:
                patientDict["2_medical"].append(ent.text)
            else:
                patientDict["1_medical"].append(ent.text)

        data_train.append(patientDict)

In [None]:
# Annotate every record of the validation split with the entities found in its
# "summary" text:
#   "2_medical" -- mentions that also occur in the record's "question"
#   "1_medical" -- all other accepted mentions
# A mention is accepted only when its lower-cased text equals the canonical
# name, or one of the aliases, of its first knowledge-base candidate.
data_validation = []

# The linker pipe is the same object for every entity, so look it up once
# instead of once per entity.
linker = nlp.get_pipe("scispacy_linker")
# Generic mentions that are never recorded.
ignored_mentions = {"information", "treatment", "treatments"}

# Explicit UTF-8 keeps decoding independent of the platform locale; the handle
# is called `infile` so the builtin input() is not shadowed for later cells.
with open("validation.txt", "r", encoding="utf-8") as infile:
    for line in infile:
        if not line.strip():
            # Blank lines (e.g. a trailing newline) are not JSON documents.
            continue
        patientDict = json.loads(line)
        patientDict["1_medical"] = []
        patientDict["2_medical"] = []
        doc = nlp(patientDict["summary"])
        for ent in doc.ents:
            name = ent.text.lower()
            if name in ignored_mentions or not ent._.kb_ents:
                continue

            # The first element of the first candidate is the concept id used
            # to look the entity up in the linker's knowledge base.
            kb_entity = linker.kb.cui_to_entity[ent._.kb_ents[0][0]]
            is_known_name = kb_entity.canonical_name.lower() == name or any(
                alias.lower() == name for alias in kb_entity.aliases
            )
            if not is_known_name:
                continue

            question = patientDict["question"].lower()
            # NOTE(review): `singularize` is not imported or defined in any
            # visible cell; on a fresh kernel this raises NameError whenever
            # `name` is not found in the question. Add the import to the
            # imports cell (presumably an inflection helper -- confirm which).
            if name in question or singularize(name) in question:
                patientDict["2_medical"].append(ent.text)
            else:
                patientDict["1_medical"].append(ent.text)

        data_validation.append(patientDict)

In [12]:
# Write each annotated split back out as JSON Lines (one object per line).
# ensure_ascii=False leaves non-ASCII characters unescaped in the output, so
# the files are opened with an explicit UTF-8 encoding instead of relying on
# the platform default, which may be unable to encode them.
# The HQS_dataset directory must already exist; open() does not create it.
annotated_splits = (
    ("HQS_dataset/train.txt", data_train),
    ("HQS_dataset/validation.txt", data_validation),
    ("HQS_dataset/test.txt", data_test),
)

for out_path, records in annotated_splits:
    with open(out_path, "w", encoding="utf-8") as outfile:
        for record in records:
            outfile.write(json.dumps(record, ensure_ascii=False))
            outfile.write("\n")