In [None]:
import spacy
import scispacy
import json
import ast

from scispacy.linking import EntityLinker

# Load the scispaCy model used for tokenisation and entity-span detection.
nlp = spacy.load("en_core_sci_md")

# Adding the linker is slow the first time: roughly 1GB of data is downloaded
# and a large JSON knowledge base is loaded. Subsequent runs are faster because
# the downloads are cached.
# NOTE: resolve_abbreviations is optional and requires that the
# AbbreviationDetector pipe has already been added to the pipeline. With both
# in place, linking is only performed on the long form of abbreviations.
# NOTE(review): no AbbreviationDetector pipe is added before this call, so the
# flag presumably has no effect here -- confirm whether the detector is wanted.
linker_config = {"resolve_abbreviations": True, "linker_name": "umls"}
nlp.add_pipe("scispacy_linker", config=linker_config)

In [None]:
# Indiana portion of the test set: every record whose "split" is "stanford"
# is skipped. Each kept record becomes a dict with:
#   "summary"   - the impression
#   "text"      - findings + " [SEP] " + background
#   "2_medical" - summary entities whose exact text also occurs in "text"
#   "1_medical" - summary entities whose exact text does not occur in "text"
#   "neg_uni"   - negation words found among the summary tokens

data_test_indiana = []

# The linker pipe is the same object for every entity, so look it up once.
linker = nlp.get_pipe("scispacy_linker")

# Tokens recorded as-is, and tokens recorded only when followed by "n't".
negation_words = ("no", "nope", "not")
contracted_negations = {"does": "doesn't", "do": "don't"}

with open('Radiology_dataset/MEDIQA2021_RRS_Test_Set_Full.json', 'r') as f:
    records = ast.literal_eval(f.read())
    for record in records:
        if record["split"] == "stanford":
            continue

        # Key insertion order is the order the keys appear when serialised.
        sample = {
            "1_medical": [],
            "2_medical": [],
            "neg_uni": [],
            "summary": record['impression'],
            "text": record['findings'] + " " + "[SEP]" + " " + record['background'],
        }

        doc = nlp(sample["summary"])
        for ent in doc.ents:
            candidates = ent._.kb_ents
            if len(candidates) == 0:
                continue

            mention = ent.text.lower()
            # Only the first linked candidate is considered.
            top_entity = linker.kb.cui_to_entity[candidates[0][0]]
            # Field 1 of the KB entry (presumably the canonical name) is tried
            # first; the aliases are only lower-cased and searched on a miss.
            is_known_term = top_entity[1].lower() == mention or mention in {
                alias.lower() for alias in top_entity.aliases
            }
            if not is_known_term:
                continue

            # The containment check is case-sensitive and uses the raw span text.
            bucket = "2_medical" if ent.text in sample["text"] else "1_medical"
            sample[bucket].append(ent.text)

        words = [token.text for token in doc]
        for position, word in enumerate(words):
            if word in negation_words:
                sample["neg_uni"].append(word)
            elif word in contracted_negations and words[position + 1:position + 2] == ["n't"]:
                # The one-element slice is empty at the last token, so no bounds check is needed.
                sample["neg_uni"].append(contracted_negations[word])

        data_test_indiana.append(sample)
        


In [None]:
# Remaining (non-Indiana) portion of the test set: every record whose "split"
# is "indiana" is skipped. Each kept record becomes a dict with:
#   "summary"   - the impression
#   "text"      - findings + " [SEP] " + background
#   "2_medical" - summary entities whose exact text also occurs in "text"
#   "1_medical" - summary entities whose exact text does not occur in "text"
#   "neg_uni"   - negation words found among the summary tokens
# NOTE(review): this cell reads the test-set file from a different directory
# than the Indiana cell does -- confirm both paths point at the same file.
data_test = []

# The linker pipe is the same object for every entity, so look it up once.
linker = nlp.get_pipe("scispacy_linker")

# Tokens recorded as-is, and tokens recorded only when followed by "n't".
negation_words = ("no", "nope", "not")
contracted_negations = {"does": "doesn't", "do": "don't"}

with open('Radiology_dataset/Task3/MEDIQA2021-TestSets-with-GroundTruth-Sept-2-2021-3/Task3_RRS/MEDIQA2021_RRS_Test_Set_Full.json', 'r') as f:
    records = ast.literal_eval(f.read())
    for record in records:
        if record["split"] == "indiana":
            continue

        # Key insertion order is the order the keys appear when serialised.
        sample = {
            "1_medical": [],
            "2_medical": [],
            "neg_uni": [],
            "summary": record['impression'],
            "text": record['findings'] + " " + "[SEP]" + " " + record['background'],
        }

        doc = nlp(sample["summary"])
        for ent in doc.ents:
            candidates = ent._.kb_ents
            if len(candidates) == 0:
                continue

            mention = ent.text.lower()
            # Only the first linked candidate is considered.
            top_entity = linker.kb.cui_to_entity[candidates[0][0]]
            # Field 1 of the KB entry (presumably the canonical name) is tried
            # first; the aliases are only lower-cased and searched on a miss.
            is_known_term = top_entity[1].lower() == mention or mention in {
                alias.lower() for alias in top_entity.aliases
            }
            if not is_known_term:
                continue

            # The containment check is case-sensitive and uses the raw span text.
            bucket = "2_medical" if ent.text in sample["text"] else "1_medical"
            sample[bucket].append(ent.text)

        words = [token.text for token in doc]
        for position, word in enumerate(words):
            if word in negation_words:
                sample["neg_uni"].append(word)
            elif word in contracted_negations and words[position + 1:position + 2] == ["n't"]:
                # The one-element slice is empty at the last token, so no bounds check is needed.
                sample["neg_uni"].append(contracted_negations[word])

        data_test.append(sample)
        

In [None]:
# Training set: every record in the file is kept (no filtering on "split").
# Each record becomes a dict with:
#   "summary"   - the impression
#   "text"      - findings + " [SEP] " + background
#   "2_medical" - summary entities whose exact text also occurs in "text"
#   "1_medical" - summary entities whose exact text does not occur in "text"
#   "neg_uni"   - negation words found among the summary tokens
data_train = []

# The linker pipe is the same object for every entity, so look it up once.
linker = nlp.get_pipe("scispacy_linker")

# Tokens recorded as-is, and tokens recorded only when followed by "n't".
negation_words = ("no", "nope", "not")
contracted_negations = {"does": "doesn't", "do": "don't"}

with open('Radiology_dataset/Task3/train.json', 'r') as f:
    records = ast.literal_eval(f.read())
    for record in records:
        # Key insertion order is the order the keys appear when serialised.
        sample = {
            "1_medical": [],
            "2_medical": [],
            "neg_uni": [],
            "summary": record['impression'],
            "text": record['findings'] + " " + "[SEP]" + " " + record['background'],
        }

        doc = nlp(sample["summary"])
        for ent in doc.ents:
            candidates = ent._.kb_ents
            if len(candidates) == 0:
                continue

            mention = ent.text.lower()
            # Only the first linked candidate is considered.
            top_entity = linker.kb.cui_to_entity[candidates[0][0]]
            # Field 1 of the KB entry (presumably the canonical name) is tried
            # first; the aliases are only lower-cased and searched on a miss.
            is_known_term = top_entity[1].lower() == mention or mention in {
                alias.lower() for alias in top_entity.aliases
            }
            if not is_known_term:
                continue

            # The containment check is case-sensitive and uses the raw span text.
            bucket = "2_medical" if ent.text in sample["text"] else "1_medical"
            sample[bucket].append(ent.text)

        words = [token.text for token in doc]
        for position, word in enumerate(words):
            if word in negation_words:
                sample["neg_uni"].append(word)
            elif word in contracted_negations and words[position + 1:position + 2] == ["n't"]:
                # The one-element slice is empty at the last token, so no bounds check is needed.
                sample["neg_uni"].append(contracted_negations[word])

        data_train.append(sample)
        

In [None]:
# Validation set: every record in the file is kept (no filtering on "split").
# Each record becomes a dict with:
#   "summary"   - the impression
#   "text"      - findings + " [SEP] " + background
#   "2_medical" - summary entities whose exact text also occurs in "text"
#   "1_medical" - summary entities whose exact text does not occur in "text"
#   "neg_uni"   - negation words found among the summary tokens
data_validation = []

# The linker pipe is the same object for every entity, so look it up once.
linker = nlp.get_pipe("scispacy_linker")

# Tokens recorded as-is, and tokens recorded only when followed by "n't".
negation_words = ("no", "nope", "not")
contracted_negations = {"does": "doesn't", "do": "don't"}

# FIXME(review): this is the *training* file, the same path the data_train cell
# reads. It looks like a copy-paste leftover; point it at the real validation
# file, otherwise the validation set is just a copy of the training set.
with open('Radiology_dataset/Task3/train.json', 'r') as f:
    records = ast.literal_eval(f.read())
    for record in records:
        # Key insertion order is the order the keys appear when serialised.
        sample = {
            "1_medical": [],
            "2_medical": [],
            "neg_uni": [],
            "summary": record['impression'],
            "text": record['findings'] + " " + "[SEP]" + " " + record['background'],
        }

        doc = nlp(sample["summary"])
        for ent in doc.ents:
            candidates = ent._.kb_ents
            if len(candidates) == 0:
                continue

            mention = ent.text.lower()
            # Only the first linked candidate is considered.
            top_entity = linker.kb.cui_to_entity[candidates[0][0]]
            # Field 1 of the KB entry (presumably the canonical name) is tried
            # first; the aliases are only lower-cased and searched on a miss.
            is_known_term = top_entity[1].lower() == mention or mention in {
                alias.lower() for alias in top_entity.aliases
            }
            if not is_known_term:
                continue

            # The containment check is case-sensitive and uses the raw span text.
            bucket = "2_medical" if ent.text in sample["text"] else "1_medical"
            sample[bucket].append(ent.text)

        words = [token.text for token in doc]
        for position, word in enumerate(words):
            if word in negation_words:
                sample["neg_uni"].append(word)
            elif word in contracted_negations and words[position + 1:position + 2] == ["n't"]:
                # The one-element slice is empty at the last token, so no bounds check is needed.
                sample["neg_uni"].append(contracted_negations[word])

        # Fixed: records were previously appended to data_train, which left
        # data_validation empty and duplicated every training record.
        data_validation.append(sample)
        

In [105]:
# Write each split as JSON Lines: one json.dumps-encoded record per line.
# Files are written in the order train, validation, test.
split_outputs = (
    ('RRS_dataset/train.txt', data_train),
    ('RRS_dataset/validation.txt', data_validation),
    ('RRS_dataset/test.txt', data_test),
)

for out_path, samples in split_outputs:
    with open(out_path, 'w') as out_file:
        for sample in samples:
            out_file.write(json.dumps(sample))
            out_file.write('\n')

In [11]:
# Write the Indiana test split as JSON Lines: one encoded record per line.
with open('RRS_dataset/test_indiana.txt', 'w') as out_file:
    out_file.writelines(json.dumps(sample) + '\n' for sample in data_test_indiana)