In [1]:
#!pip install py2neo
#!pip install neo4j
#!pip install spacy
#!pip install python-dateutil==2.1
#!pip install scispacy
#!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz
#!pip install crosslingual-coreference spacy-transformers==1.1.5 wikipedia neo4j

In [2]:
import requests
import json
import csv
import io
import spacy
import crosslingual_coreference
from spacy import Language, util
from spacy.tokens import Doc, Span
from transformers import pipeline
from typing import List
import re
import hashlib
#from py2neo import Graph
#from neo4j import GraphDatabase, basic_auth

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\austi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# Rebel component adapted from https://github.com/Babelscape/rebel 
def call_wiki_api(item):
  try:
    url = f"https://www.wikidata.org/w/api.php?action=wbsearchentities&search={item}&language=en&format=json"
    data = requests.get(url).json()
    # Return the first id (Could upgrade this in the future)
    return data['search'][0]['id']
  except:
    return 'id-less'

def extract_triplets(text):
    """
    Function to parse the generated text and extract the triplets
    """
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})

    return triplets


@Language.factory(
    "rebel",
    requires=["doc.sents"],
    assigns=["doc._.rel"],
    default_config={
        "model_name": "Babelscape/rebel-large",
        "device": 0,
    },
)


class RebelComponent:
    def __init__(
        self,
        nlp,
        name,
        model_name: str,
        device: int,
    ):
        assert model_name is not None, ""
        self.triplet_extractor = pipeline("text2text-generation", model=model_name, tokenizer=model_name, device=device)
        self.entity_mapping = {}
        # Register custom extension on the Doc
        if not Doc.has_extension("rel"):
          Doc.set_extension("rel", default={})

    def get_wiki_id(self, item: str):
        mapping = self.entity_mapping.get(item)
        if mapping:
          return mapping
        else:
          res = call_wiki_api(item)
          self.entity_mapping[item] = res
          return res

    
    def _generate_triplets(self, sent: Span) -> List[dict]:
          output_ids = self.triplet_extractor(sent.text, return_tensors=True, return_text=False)[0]["generated_token_ids"]["output_ids"]
          extracted_text = self.triplet_extractor.tokenizer.batch_decode(output_ids[0])
          extracted_triplets = extract_triplets(extracted_text[0])
          return extracted_triplets

    def set_annotations(self, doc: Doc, triplets: List[dict]):
        for triplet in triplets:

            # Remove self-loops (relationships that start and end at the entity)
            if triplet['head'] == triplet['tail']:
                continue

            # Use regex to search for entities
            head_span = re.search(triplet["head"], doc.text)
            tail_span = re.search(triplet["tail"], doc.text)

            # Skip the relation if both head and tail entities are not present in the text
            # Sometimes the Rebel model hallucinates some entities
            if not head_span or not tail_span:
              continue

            index = hashlib.sha1("".join([triplet['head'], triplet['tail'], triplet['type']]).encode('utf-8')).hexdigest()
            if index not in doc._.rel:
                # Get wiki ids and store results
                doc._.rel[index] = {"relation": triplet["type"], "head_span": {'text': triplet['head'], 'id': self.get_wiki_id(triplet['head'])}, "tail_span": {'text': triplet['tail'], 'id': self.get_wiki_id(triplet['tail'])}}

    def __call__(self, doc: Doc) -> Doc:
        for sent in doc.sents:
            sentence_triplets = self._generate_triplets(sent)
            self.set_annotations(doc, sentence_triplets)
        return doc
    def pipe(self, stream, batch_size=128):
        """
        It takes a stream of documents, and for each document,
        it generates a list of sentence triplets,
        and then sets the annotations for each sentence in the document
        :param stream: a generator of Doc objects
        :param batch_size: The number of documents to process at a time, defaults to 128 (optional)
        """
        for docs in util.minibatch(stream, size=batch_size):
            sents = []
            for doc in docs:
                sents += doc.sents
            sentence_triplets = self._generate_triplets(sents)
            index = 0
            for doc in docs:
                n_sent = len(list(doc.sents))
                self.set_annotations(doc, sentence_triplets[index : index + n_sent])
                index += n_sent
                yield doc

In [4]:
# Collects abstracts and uses PubTator for a list of PMIDs
def SubmitPMIDList(Pmid,Format,Bioconcept):

    # json = {"Pmids": Pmid}

    json = {}

    #
    # load pmids
    #
    with io.open(Inputfile,'r',encoding="utf-8") as file_input:
        json = {"pmids": [pmid.strip() for pmid in file_input.readlines()]}
    
    if Bioconcept != "": 
        json["concepts"] = Bioconcept.split(",")
    
    print(json)
    r = requests.post("https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/" + Format , json = json)
    
    res = r.json()
    print(res)

    pmid = res['id']
    res0 = res['passages'][0]['annotations']
    res1 = res['passages'][1]['annotations']
    
    result = {}
    result = res0 + res1
    
    for i in result:
        i['Pmid'] = Pmid
    return result
    
    if r.status_code != 200 :
        print ("[Error]: HTTP code "+ str(r.status_code))
    else:
        return result

In [5]:
Inputfile = "./pmid"
Format = "biocjson"
Bioconcept = ""
res_json=[]

In [6]:
with io.open(Inputfile,'r',encoding="utf-8") as file_input:
    pmidlist = {"pmids": [pmid.strip() for pmid in file_input.readlines()]}
    
for i in pmidlist['pmids']:
    res_json.extend(SubmitPMIDList([i], Format, Bioconcept))

{'pmids': ['35900868']}
{'_id': '35900868|None', 'id': '35900868', 'infons': {}, 'passages': [{'infons': {'journal': 'J Clin Invest;2022Jul28. doi:10.1172/JCI161908', 'year': '2022', 'type': 'title', 'authors': 'Lone MA, Aaltonen MJ, Zidell A, Pedro HF, Morales Saute JA, Mathew S, Mohassel P, Bonnemann CG, Shoubridge EA, Hornemann T, ', 'section': 'Title'}, 'offset': 0, 'text': 'SPTLC1 variants associated with ALS produce distinct sphingolipid signatures through impaired interaction with ORMDL proteins.', 'sentences': [], 'annotations': [{'id': '2', 'infons': {'identifier': '10558', 'type': 'Gene', 'ncbi_homologene': '4681'}, 'text': 'SPTLC1', 'locations': [{'offset': 0, 'length': 6}]}, {'id': '3', 'infons': {'identifier': 'MESH:D013107', 'type': 'Chemical'}, 'text': 'sphingolipid', 'locations': [{'offset': 53, 'length': 12}]}], 'relations': []}, {'infons': {'type': 'abstract', 'section': 'Abstract'}, 'offset': 127, 'text': 'Amyotrophic lateral sclerosis (ALS) is a progressive neurodeg

In [7]:
res_json

[{'id': '2',
  'infons': {'identifier': '10558', 'type': 'Gene', 'ncbi_homologene': '4681'},
  'text': 'SPTLC1',
  'locations': [{'offset': 0, 'length': 6}],
  'Pmid': ['35900868']},
 {'id': '3',
  'infons': {'identifier': 'MESH:D013107', 'type': 'Chemical'},
  'text': 'sphingolipid',
  'locations': [{'offset': 53, 'length': 12}],
  'Pmid': ['35900868']},
 {'id': '31',
  'infons': {'identifier': 'MESH:D000690', 'type': 'Disease'},
  'text': 'Amyotrophic lateral sclerosis',
  'locations': [{'offset': 127, 'length': 29}],
  'Pmid': ['35900868']},
 {'id': '32',
  'infons': {'identifier': 'MESH:D019636', 'type': 'Disease'},
  'text': 'neurodegenerative disease',
  'locations': [{'offset': 180, 'length': 25}],
  'Pmid': ['35900868']},
 {'id': '33',
  'infons': {'identifier': '10558', 'type': 'Gene', 'ncbi_homologene': '4681'},
  'text': 'SPTLC1',
  'locations': [{'offset': 248, 'length': 6}],
  'Pmid': ['35900868']},
 {'id': '34',
  'infons': {'identifier': '189', 'type': 'Gene', 'ncbi_homo

In [8]:
# Collect "gene" and "chemical" entities from pubtator
f = open('Pubtator_entities.txt', mode='w')

[f.write(res_json[i]['text'] + '\n') for i in range(len(res_json)) if(res_json[i]['infons']['type'] == 'Gene' or res_json[i]['infons']['type'] == 'Chemical')]
f.close()
f = open('Pubtator_entities.txt', mode='r')
print(f.read())
Pub_ents = []
for i in range(len(res_json)):
  #if(res_json[i]['infons']['type'] == 'Gene' or res_json[i]['infons']['type'] == 'Chemical'):
  Pub_ents.append(res_json[i]['text'])
    
'''
for i in range(len(res_json)):
  if(res_json[i]['infons']['type'] == 'Gene' or res_json[i]['infons']['type'] == 'Chemical'):
    print(res_json[i]['text'], '\n')
'''


SPTLC1
sphingolipid
SPTLC1
serine-palmitoyltransferase
SPT
sphingolipids
SPTLC1
SPT
SPTLC1
sphingolipid
lipid
SPTLC1
HSAN1
SPT
L-alanine
serine
serine
SPTLC1
HSAN1
SPTLC1
HSAN1
1-deoxySL
serine
SPTLC1



"\nfor i in range(len(res_json)):\n  if(res_json[i]['infons']['type'] == 'Gene' or res_json[i]['infons']['type'] == 'Chemical'):\n    print(res_json[i]['text'], '\n')\n"

In [9]:
[str(x) for x in Pub_ents]
print(Pub_ents)

['SPTLC1', 'sphingolipid', 'Amyotrophic lateral sclerosis', 'neurodegenerative disease', 'SPTLC1', 'serine-palmitoyltransferase', 'SPT', 'sphingolipids', 'SPTLC1', 'SPT', 'SPTLC1', 'sphingolipid', 'lipid', 'SPTLC1', 'sensory neuropathy', 'HSAN1', '1-deoxysphingolipids (1-deoxySLs', 'SPT', 'L-alanine', 'serine', 'serine', 'SPTLC1', 'HSAN1', 'SPTLC1', 'patient', 'HSAN1', '1-deoxySL', 'serine', 'SPTLC1']


In [10]:
# Text to test models on (from abstracts cited in ReadMe)
input_text = 'Amyotrophic lateral sclerosis (ALS) is a progressive neurodegenerative disease that affects motor neurons. Mutations in the SPTLC1 subunit of serine palmitoyltransferase (SPT), which catalyzes the first step in the de novo synthesis of sphingolipids (SLs), cause childhood-onset ALS. SPTLC1-ALS variants map to a transmembrane domain that interacts with ORMDL proteins, negative regulators of SPT activity. We show that ORMDL binding to the holoenzyme complex is impaired in cells expressing pathogenic SPTLC1-ALS alleles, resulting in increased SL synthesis and a distinct lipid signature. C-terminal SPTLC1 variants cause peripheral hereditary sensory and autonomic neuropathy type 1 (HSAN1) due to the synthesis of 1-deoxysphingolipids (1-deoxySLs) that form when SPT metabolizes L-alanine instead of L-serine. Limiting L-serine availability in SPTLC1-ALS-expressing cells increased 1-deoxySL and shifted the SL profile from an ALS to an HSAN1-like signature. This effect was corroborated in an SPTLC1-ALS pedigree in which the index patient uniquely presented with an HSAN1 phenotype, increased 1-deoxySL levels, and an L-serine deficiency. These data demonstrate how pathogenic variants in different domains of SPTLC1 give rise to distinct clinical presentations that are nonetheless modifiable by substrate availability.'

In [11]:
# get entities to compare to pubtator entities
def getEntities(modelName):
    nlp = spacy.load(modelName) #instantiate scispacy model
    doc = nlp(input_text)
    return doc.ents # entities created by the model

In [12]:
# Function to instantiate a model and collect triples with Rebel and crosslingual_coreference
def getTriples(modelName):
    DEVICE = -1 # Number of the GPU, -1 if want to use CPU

    # Add coreference resolution model
    coref = spacy.load(modelName, disable=['ner', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer'])
    coref.add_pipe(
        "xx_coref", config={"chunk_size": 2500, "chunk_overlap": 2, "device": DEVICE})


    # Define rel extraction model
    rel_ext = spacy.load(modelName, disable=['ner', 'lemmatizer', 'attribute_rules', 'tagger'])
    rel_ext.add_pipe("rebel", config={
        'device':DEVICE, # Number of the GPU, -1 if want to use CPU
        'model_name':'Babelscape/rebel-large'} # Model used, will default to 'Babelscape/rebel-large' if not given
        )
    coref_text = coref(input_text)._.resolved_text

    doc = rel_ext(coref_text)

    triples = []
    for value, rel_dict in doc._.rel.items():
        
        new_dict = {}
        new_dict['head'] = rel_dict['head_span']['text']
        new_dict['rel'] = rel_dict['relation']
        new_dict['tail'] = rel_dict['tail_span']['text']
        triples.append(new_dict)

    return triples

### TODO below

In [13]:
# Get recall for named entity recognition (NER) based on PubTator entities as ground truth
def get_ner_recall(model_ents, modelName):
    model_ents = [x.text for x in model_ents]

    model_recall = 0
    
    
    #TODO: SHould this be divided by Pub_ents or?????????????????????????????????????????????????????????????????????
    
    model_recall = sum([1 for x in model_ents if x in Pub_ents])/Pub_ents.__len__()
    additional_recall = add_to_recall(model_ents)
    model_recall += additional_recall
    print('Percentage recall for ' + modelName + ': ', model_recall)

In [14]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')

In [20]:
# Function to add to recall percentage based on cosine similarity of terms that are not identical
# Threshold for similarity determined by abstracts cited in ReadMe and domain expertise
def add_to_recall(model_ents):
    
    #for x in range(len(model_ents)):
    #    for y in range(len(Pub_ents)):
    #        embeddings1 = model.encode(x, convert_to_tensor=True)
    #        embeddings2 = model.encode(y, convert_to_tensor=True)
    #        #Compute cosine-similarities
    #        cosine_scores = util.cos_sim(embeddings1, embeddings2)
    
    # Remove words that already exist in Pub_ents (these
    # are already accounted for)
    #model_ents = [x.text for x in model_ents]
    model_ents = [x for x in model_ents if x not in Pub_ents]
    
    #Compute embedding for both lists
    embeddings1 = model.encode(model_ents, convert_to_tensor=True)
    embeddings2 = model.encode(Pub_ents, convert_to_tensor=True)
  
    #Compute cosine-similarities
    cosine_scores = util.cos_sim(embeddings1, embeddings2)
    cosine_dict = {}
    for i in range(len(cosine_scores)):
        cosine_dict[model_ents[i]] = cosine_scores[i].tolist()
    
    # count to add to percentage correct recall
    recall_count = 0
    
    # Word and similarity number for highest number for each row (each model ent)
    for i in range(len(cosine_dict)):
        print(list(cosine_dict.keys())[i], ': ')
        print(max(list(cosine_dict.values())[i]), ' ', Pub_ents[list(cosine_dict.values())[i].index(max(list(cosine_dict.values())[i]))])
        if (max(list(cosine_dict.values())[i]) > 0.75):
            recall_count += 1
        elif(max(list(cosine_dict.values())[i]) > 0.50 and max(list(cosine_dict.values())[i]) <= 0.75):
            recall_count += max(list(cosine_dict.values())[i])
    recall_count = recall_count/len(Pub_ents)
    return recall_count
        #TODO: if best match below some value or smthn, check for entailment 
        #      from str that are in Pub_ents with algorithmia textual_entailment
        # Would this solve the problem of artificially high ner recall score 
        # due to multiple instances of the same entity being recognized >once?
        # Is this even a concern bc all models tested the same ?

In [16]:
Sci_md_ents = getEntities("en_core_sci_md")
#Sci_md_triples = getTriples("en_core_sci_md")
#print(Sci_md_triples)

In [17]:
print(Sci_md_ents)

(Amyotrophic lateral sclerosis, ALS, progressive, neurodegenerative disease, motor neurons, Mutations, SPTLC1 subunit, serine palmitoyltransferase, SPT, de novo, synthesis, sphingolipids, SLs, childhood-onset, ALS, SPTLC1-ALS, variants, transmembrane domain, ORMDL proteins, negative regulators, SPT, activity, ORMDL, binding, holoenzyme complex, impaired, cells, expressing, pathogenic, SPTLC1-ALS, alleles, increased, SL, synthesis, lipid signature, C-terminal SPTLC1, variants, peripheral hereditary sensory, autonomic neuropathy type 1, HSAN1, synthesis, 1-deoxysphingolipids, 1-deoxySLs, SPT, metabolizes, L-alanine, L-serine, L-serine, availability, SPTLC1-ALS-expressing cells, increased, 1-deoxySL, SL, profile, ALS, HSAN1-like signature, effect, SPTLC1-ALS, pedigree, index, patient, HSAN1, phenotype, increased, 1-deoxySL, levels, L-serine deficiency, data, pathogenic, variants, domains, SPTLC1, clinical presentations, modifiable, substrate, availability)


In [18]:
add_to_recall(Sci_md_ents)

ALS : 
0.49601253867149353   Amyotrophic lateral sclerosis
progressive : 
0.3143276870250702   patient
motor neurons : 
0.3994305431842804   Amyotrophic lateral sclerosis
Mutations : 
0.28211531043052673   patient
SPTLC1 subunit : 
0.7980115413665771   SPTLC1
serine palmitoyltransferase : 
0.9954445958137512   serine-palmitoyltransferase
de novo : 
0.3745591640472412   1-deoxySL
synthesis : 
0.2679099440574646   L-alanine
SLs : 
0.32089465856552124   SPT
childhood-onset : 
0.2590651512145996   neurodegenerative disease
SPTLC1-ALS : 
0.6890350580215454   SPTLC1
variants : 
0.31245529651641846   patient
transmembrane domain : 
0.3611960709095001   sphingolipids
ORMDL proteins : 
0.29700836539268494   serine-palmitoyltransferase
negative regulators : 
0.25095686316490173   1-deoxySL
activity : 
0.26917219161987305   patient
ORMDL : 
0.2724124491214752   patient
binding : 
0.19948479533195496   patient
holoenzyme complex : 
0.2756074368953705   1-deoxysphingolipids (1-deoxySLs
impaired : 


0.372014543105816

In [21]:
get_ner_recall(Sci_md_ents, 'Sci_md')

ALS : 
0.49601253867149353   Amyotrophic lateral sclerosis
progressive : 
0.3143276870250702   patient
motor neurons : 
0.3994305431842804   Amyotrophic lateral sclerosis
Mutations : 
0.28211531043052673   patient
SPTLC1 subunit : 
0.7980115413665771   SPTLC1
serine palmitoyltransferase : 
0.9954445958137512   serine-palmitoyltransferase
de novo : 
0.3745591640472412   1-deoxySL
synthesis : 
0.2679099440574646   L-alanine
SLs : 
0.32089465856552124   SPT
childhood-onset : 
0.2590651512145996   neurodegenerative disease
SPTLC1-ALS : 
0.6890350580215454   SPTLC1
variants : 
0.31245529651641846   patient
transmembrane domain : 
0.3611960709095001   sphingolipids
ORMDL proteins : 
0.29700836539268494   serine-palmitoyltransferase
negative regulators : 
0.25095686316490173   1-deoxySL
activity : 
0.26917219161987305   patient
ORMDL : 
0.2724124491214752   patient
binding : 
0.19948479533195496   patient
holoenzyme complex : 
0.2756074368953705   1-deoxysphingolipids (1-deoxySLs
impaired : 


In [None]:
Sci_lg_ents = getEntities("en_core_sci_lg")
Sci_lg_triples = getTriples("en_core_sci_lg")
print(Sci_lg_triples)

In [None]:
Sci_craft_ents = getEntities("en_ner_craft_md")
Sci_craft_triples = getTriples("en_ner_craft_md")

In [None]:
Sci_bio_ents = getEntities("en_ner_bionlp13cg_md")
Sci_bio_triples = getTriples("en_ner_bionlp13cg_md")

In [None]:
get_ner_recall(Sci_md_ents, 'Sci_md')

In [None]:
get_ner_recall(Sci_lg_ents, 'Sci_lg')

In [None]:
get_ner_recall(Sci_craft_ents, 'Sci_CRAFT')

In [None]:
get_ner_recall(Sci_bio_ents, 'Sci_BioNLP')