In [11]:
import scispacy
import spacy
import spacyfishing
import coreferee
from SPARQLWrapper import SPARQLWrapper, JSON
from uuid import uuid4
from datetime import datetime

# General-English spaCy pipeline with the coreferee component appended;
# used by coref_resolution to resolve pronouns.
coref = spacy.load("en_core_web_lg")
coref.add_pipe('coreferee')

# scispaCy pipeline with the entity-fishing component appended; entities it
# links expose their knowledge-base id as `ent._.kb_qid` (read in extract_duples).
# NOTE(review): the endpoint below is plain http:// and is a remote service —
# the analysed text is presumably sent to it unencrypted; confirm this is
# acceptable for patient notes.
disease = spacy.load("en_core_sci_sm")
disease.add_pipe('entityfishing', config={
        "extra_info": True,
        "api_ef_base": "http://nerd.huma-num.fr/nerd/service"
    })

# Shared client for the public Wikidata SPARQL endpoint; results are requested as JSON.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)

In [2]:
def coref_resolution(name, text):
    """Replace resolvable pronouns by their referent and anonymise the patient.

    The text is parsed with the module-level ``coref`` pipeline. Each token
    tagged ``PRP`` for which ``doc._.coref_chains.resolve`` returns a non-empty
    result is replaced by the text of the first resolved token; every other
    token (including pronouns that cannot be resolved) is kept unchanged.
    Tokens are re-joined with single spaces, so the original spacing around
    punctuation is not preserved. Finally every occurrence of ``name`` is
    replaced by the word ``patient``.

    Args:
        name: Patient name to anonymise. When empty or ``None`` the
            replacement step is skipped (``str.replace("", ...)`` would
            otherwise insert the word between every character).
        text: Raw text to process.

    Returns:
        The rewritten text, stripped of leading/trailing whitespace.
    """
    doc = coref(text)
    chains = doc._.coref_chains

    pieces = []
    for token in doc:
        # Only personal pronouns are candidates for substitution.
        referents = chains.resolve(token) if token.tag_ == "PRP" else None
        if referents:
            # Only the first referent is used, as before; a pronoun that
            # resolves to several tokens keeps just the first one.
            pieces.append(referents[0].text)
        else:
            # Not a pronoun, or a pronoun without a resolution: keep it
            # instead of failing on an empty result.
            pieces.append(token.text)

    final_text = " ".join(pieces)
    if name:
        final_text = final_text.replace(name, "patient")
    return final_text.strip()

In [3]:
def _is_wikidata_qid(value):
    """Return True if ``value`` is a string of the form ``Q`` + ASCII digits."""
    return (
        isinstance(value, str)
        and len(value) > 1
        and value[0] == "Q"
        and all(ch in "0123456789" for ch in value[1:])
    )


def extract_duples(folder_id, text):
    """Extract ``(predicate, object)`` pairs from patient statements in ``text``.

    The text is parsed with the module-level ``disease`` pipeline. For every
    token whose part of speech is ``VERB`` or ``AUX`` the first ``nsubj`` child
    and the first ``dobj``/``nmod`` child are taken. Both must fall inside an
    entity that carries a ``kb_qid``, and the subject entity must be
    ``Q181600`` (presumably the Wikidata item for "patient" — confirm). The
    object's ``wdt:P31`` values are then fetched through the module-level
    ``sparql`` client and mapped to a predicate:

    * ``Q112965645`` or ``Q12136``  -> ``mp:declaredSymptom``
    * ``Q12140``                    -> ``mp:triedMed``

    (the meaning of those class ids is assumed from the predicate names —
    TODO confirm against Wikidata). Objects matching neither are ignored.

    Args:
        folder_id: Accepted for interface compatibility; not used here.
        text: Text to analyse.

    Returns:
        A list of ``(predicate, "wd:<QID>")`` tuples in document order.
        Duplicates are kept. Lookup failures are printed and skipped.
    """
    doc = disease(text)

    # Only entities with a knowledge-base id can be matched.
    linked_ents = [ent for ent in doc.ents if ent._.kb_qid is not None]

    def linked_entity(token):
        """Return the linked entity whose span contains ``token``, or None."""
        for ent in linked_ents:
            if ent.start <= token.i < ent.end:
                return ent
        return None

    # One network round trip per distinct object id within this call.
    classes_by_qid = {}

    def instance_of(qid):
        """Return the set of ``wdt:P31`` values of ``qid`` (cached per call)."""
        if qid not in classes_by_qid:
            sparql.setQuery("""
            PREFIX wd: <http://www.wikidata.org/entity/>
            PREFIX wdt: <http://www.wikidata.org/prop/direct/>

            SELECT ?x
            WHERE {
                wd:""" + qid + """ wdt:P31 ?x
            }
            """
            )
            ret = sparql.queryAndConvert()
            classes_by_qid[qid] = {
                row['x']['value'].split('/')[-1]
                for row in ret["results"]["bindings"]
            }
        return classes_by_qid[qid]

    duples = []

    for sent in doc.sents:
        for verb in sent:
            if verb.pos_ not in ("VERB", "AUX"):
                continue

            children = list(verb.children)
            subjects = [child for child in children if child.dep_ == "nsubj"]
            objects = [child for child in children if child.dep_ in ("dobj", "nmod")]
            if not subjects or not objects:
                continue

            # Subject and object are looked up independently, so a single
            # entity covering both tokens is no longer silently dropped.
            subject_ent = linked_entity(subjects[0])
            object_ent = linked_entity(objects[0])
            if subject_ent is None or object_ent is None:
                continue

            if subject_ent._.kb_qid != "Q181600":
                continue

            object_qid = object_ent._.kb_qid
            # The id comes from an external linking service and is spliced
            # into the query text, so refuse anything that is not a plain QID.
            if not _is_wikidata_qid(object_qid):
                print(f"Skipping entity with unexpected id: {object_qid!r}")
                continue

            try:
                classes = instance_of(object_qid)
            except Exception as e:
                # Best effort: report the failed lookup and move on.
                print(e)
                continue

            if 'Q112965645' in classes or 'Q12136' in classes:
                predicate = "mp:declaredSymptom"
            elif 'Q12140' in classes:
                predicate = "mp:triedMed"
            else:
                continue

            duples.append((predicate, f'wd:{object_qid}'))

    return duples

In [22]:
def create_query(name, ssm, folder_id, duples, doctor_id):
    """Build the SPARQL ``INSERT DATA`` update describing one consultation.

    The update writes into graph ``<http://localhost:8082>`` a ``mc:Patient``
    node ``pat:<ssm>`` (with its name and a ``mp:consulted`` link) and a
    ``mc:Consultation`` node ``cons:<folder_id>`` carrying the doctor in
    charge, the current timestamp and one statement per entry of ``duples``.

    Args:
        name: Patient name (str); emitted as a double-quoted literal with
            backslash, double quote, newline and carriage return escaped so
            it cannot terminate the literal early.
        ssm: Patient identifier, appended verbatim after ``pat:``.
        folder_id: Consultation identifier, appended (via ``str``) after ``cons:``.
        duples: Iterable of items whose first two elements are a predicate
            and an object, both already in SPARQL syntax.
        doctor_id: Doctor identifier, appended (via ``str``) after ``doc:``.

    Returns:
        The query text.

    NOTE(review): ``ssm``, ``folder_id``, ``doctor_id`` and the ``duples``
    entries are inserted unescaped — callers must pass trusted values.
    """
    # Escape the backslash first so later escapes are not doubled.
    literal_name = (
        name.replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )

    # Naive local time, seconds precision, e.g. 2023-01-25T14:20:54.
    timestamp = datetime.now().isoformat(timespec="seconds")

    consultation_statements = [
        'a mc:Consultation',
        'mp:docInCharge doc:' + str(doctor_id),
        'mp:tookPlace "' + timestamp + '"^^xsd:dateTime',
    ]
    consultation_statements.extend(f'{item[0]} {item[1]}' for item in duples)
    # Join with ';' and close with '.', so an empty ``duples`` leaves no
    # dangling ';' before the final '.'.
    consultation_body = ' ;\n                '.join(consultation_statements) + ' .'

    return """
    PREFIX pat: <http://www.inria.org/patients/>
    PREFIX doc: <http://www.inria.org/doctors/>
    PREFIX cons: <http://www.inria.org/consultations/>
    PREFIX mc: <http://www.inria.org/entity/>
    PREFIX mp: <http://www.inria.org/property/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    
    INSERT DATA {
        GRAPH <http://localhost:8082> {
            pat:""" + ssm + """
                a mc:Patient ;
                mp:name \"""" + literal_name + """\" ;
                mp:consulted cons:""" + str(folder_id) + """ .
                
            cons:""" + str(folder_id) + """
                """ + consultation_body + """
            
        }
    }"""

In [23]:
def text_2_sparql(name, ssm, text, doctor_id):
    """Convert a free-text note about a patient into a SPARQL update string.

    A fresh UUID identifies the consultation. The note is first passed
    through ``coref_resolution``, the resolved text is mined by
    ``extract_duples``, and the result is rendered by ``create_query``.

    Args:
        name: Patient name as it appears in ``text``.
        ssm: Patient identifier forwarded to ``create_query``.
        text: The note to analyse.
        doctor_id: Identifier of the doctor in charge.

    Returns:
        Whatever ``create_query`` returns for the extracted statements.
    """
    consultation_id = uuid4()
    resolved_text = coref_resolution(name, text)
    return create_query(
        name,
        ssm,
        consultation_id,
        extract_duples(consultation_id, resolved_text),
        doctor_id,
    )

In [24]:
# Sample note used to exercise the whole pipeline end to end.
text = """
Johnson suffered from a fever. He also had a headache for 3 days. He also has hypocalcemia. He already tried peramivir 
"""

# Random identifier standing in for a real doctor id in this demo.
doctor_id = uuid4()

# print() is used so the multi-line query is shown with real line breaks.
print(text_2_sparql("Johnson", "684656-8146516-13520", text, doctor_id))



    PREFIX pat: <http://www.inria.org/patients/>
    PREFIX doc: <http://www.inria.org/doctors/>
    PREFIX cons: <http://www.inria.org/consultations/>
    PREFIX mc: <http://www.inria.org/entity/>
    PREFIX mp: <http://www.inria.org/property/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    
    INSERT DATA {
        GRAPH <http://localhost:8082> {
            pat:684656-8146516-13520
                a mc:Patient ;
                mp:name "Johnson" ;
                mp:hasFolder cons:59269e03-7b0c-4bab-bfb2-eb4bccda1408 .
                
            cons:59269e03-7b0c-4bab-bfb2-eb4bccda1408
                a mc:Consultation ;
                mp:docInCharge doc:08d3d4d6-629d-4847-a469-9d6d4bf9ab02 ;
                mp:tookPlace "2023-01-25T14:20:54"^^xsd:dateTime ;
                mp:declaredSymptom wd:Q38933 ;
                mp:declaredSymptom wd:Q86 ;
                mp:declaredSymptom wd:Q936382 ;
                mp:triedM

In [14]:
# Scratch check of the timestamp expression used by create_query:
# current local time as YYYY-MM-DDTHH:MM:SS with the fractional seconds dropped.
str(datetime.now()).replace(' ', 'T').split('.')[0]

'2023-01-25T14:00:40'