# Entity Extraction and Linking using modern SciSpaCy models

In [1]:
import dask.dataframe as dd
import json
import numpy as np
import pandas as pd
import s3fs
import spacy
import scispacy

from dask.distributed import Client, progress, get_worker

from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker

In [2]:
# Knowledge base used by the entity linker; it is also embedded in the name
# of the output folder. Leave exactly one assignment uncommented.
# MODEL_KB = "umls"
# MODEL_KB = "mesh"
# MODEL_KB = "go"
MODEL_KB = "hpo"
# MODEL_KB = "rxnorm"

In [3]:
# S3 bucket that holds both the input and the output datasets.
BUCKET_NAME = "saturn-elsevierinc"

# Input: parquet dataset of sentences.
SENTENCE_FOLDER = "/".join(("s3:/", BUCKET_NAME, "cord19-sents-pq-sm"))
# Output: parquet dataset of linked entities, one folder per knowledge base.
ENTITIES_FOLDER = "/".join(
    ("s3:/", BUCKET_NAME, "cord19-ents-{:s}-pq-sm".format(MODEL_KB)))

In [4]:
# Load the sentence dataset as a Dask dataframe and preview it; head() is
# allowed to draw from up to 10 partitions.
sentences_df = dd.read_parquet(SENTENCE_FOLDER, engine="pyarrow")
sentences_df.head(npartitions=10)

Unnamed: 0,cord_uid,pid,sid,stext
6498,sz7qmi8q,A0,0,Schwer punkt: Lun gen-und Pleura pa tho lo gie...
6498,sz7qmi8q,A0,1,für Pa tho lo gie der Ruhr-Uni ver si tät Bo c...
6498,sz7qmi8q,A0,2,fi ka ti on der Er kran kun gen des pul mo
6498,sz7qmi8q,A0,3,na len Sur fac tant-Sys tems TYP I TYP II TYP ...
6498,sz7qmi8q,A0,4,re nenund Säug lings al ter Neu ge bo


In [5]:
# Number of sentence rows in the input dataset.
len(sentences_df)

7313

## Processing

In [6]:
# In-process Dask client (processes=False) with a single worker and a single
# thread per worker.
client = Client(processes=False, n_workers=1, threads_per_worker=1)
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 46529 instead
  http_address["port"], self.http_server.port


0,1
Client  Scheduler: inproc://10.0.31.96/27879/1  Dashboard: http://10.0.31.96:46529/status,Cluster  Workers: 1  Cores: 1  Memory: 16.25 GB


In [7]:
def build_nlp_pipeline(model_kb):
    """Assemble the SciSpaCy pipeline used for entity extraction and linking.

    Loads the ``en_core_sci_md`` model, then appends an abbreviation detector
    followed by an entity linker configured for the given knowledge base.

    Parameters
    ----------
    model_kb : str
        Knowledge base name handed to ``EntityLinker`` as ``name``.

    Returns
    -------
    The loaded pipeline with both extra components added.
    """
    pipeline = spacy.load("en_core_sci_md")

    # The abbreviation detector goes in first; the linker is created with
    # resolve_abbreviations=True.
    pipeline.add_pipe(AbbreviationDetector(pipeline))
    pipeline.add_pipe(EntityLinker(name=model_kb,
                                   filter_for_definitions=False,
                                   resolve_abbreviations=True))

    return pipeline


def nlp_workers():
    """Build the NLP pipeline on the current Dask worker and cache it there.

    Meant to be executed on workers (e.g. via ``client.run``). The pipeline
    for ``MODEL_KB`` is stored on the worker object as ``worker.nlp`` so that
    later tasks can reuse it.

    Returns
    -------
    int or str
        ``0`` on success; otherwise the formatted traceback of the failure,
        returned as a string instead of being raised.
    """
    import traceback
    try:
        worker = get_worker()
        worker.nlp = build_nlp_pipeline(MODEL_KB)
    except Exception:
        # Only ordinary errors are turned into a report; KeyboardInterrupt
        # and SystemExit must still propagate.
        return traceback.format_exc()
    return 0


def check_nlp_workers():
    """Return the string form of the pipeline stored on the current worker.

    Reads ``worker.nlp`` from the worker returned by ``get_worker()``; the
    attribute access fails if no pipeline has been stored on it.
    """
    return str(get_worker().nlp)


# Build the pipeline on every worker (timed). Each worker reports 0 on
# success or a traceback string on failure.
%time client.run(nlp_workers)



CPU times: user 8.2 s, sys: 593 ms, total: 8.79 s
Wall time: 9.93 s


{'inproc://10.0.31.96/27879/3': 0}

In [8]:
# Confirm that each worker holds a pipeline by showing its string form.
client.run(check_nlp_workers)

{'inproc://10.0.31.96/27879/3': '<spacy.lang.en.English object at 0x7f331d977110>'}

In [9]:
def handle_batch(sents, nlp, model_kb):
    """Run one batch of sentences through the pipeline and collect entity links.

    Parameters
    ----------
    sents : list of str
        Sentences to process; the whole list is sent as a single batch.
    nlp :
        Pipeline exposing ``pipe``; its docs must carry ``ents`` whose
        ``_.kb_ents`` holds ``(concept_id, score)`` pairs.
    model_kb : str
        Knowledge base label copied into every output tuple.

    Returns
    -------
    list of list of tuple
        One inner list per input sentence, in order. Each tuple is
        ``(eid, model_kb, text, concept_id, score, start_char, end_char)``,
        where ``eid`` is the position of the mention within the sentence's
        entities. A mention yields one tuple per linked candidate and none
        when it has no candidates.
    """
    batch_size = len(sents)
    if batch_size == 0:
        # Nothing to do; also avoids asking the pipeline for batch_size=0.
        return []

    # n_threads is intentionally not passed: it is a deprecated no-op in
    # spaCy 2.1+ and is no longer accepted by spaCy 3.
    ents_list = []
    for doc in nlp.pipe(sents, batch_size=batch_size):
        ents_list.append([
            (eid, model_kb, ent.text, cid, score,
             ent.start_char, ent.end_char)
            for eid, ent in enumerate(doc.ents)
            for cid, score in ent._.kb_ents
        ])
    return ents_list


def handle_partition(part):
    """Extract linked entities for every sentence in one dataframe partition.

    Sentences are read from the ``stext`` column and sent to ``handle_batch``
    in chunks of 32, using the pipeline cached on the current worker as
    ``worker.nlp`` and the module-level ``MODEL_KB``.

    Parameters
    ----------
    part : pandas.DataFrame
        Partition containing an ``stext`` column.

    Returns
    -------
    list
        The concatenated ``handle_batch`` results, i.e. one entry per row of
        ``part`` in row order, so the list can be used as a new column.
    """
    batch_size = 32
    nlp = get_worker().nlp
    sentences = part["stext"].tolist()

    # Every row is kept so that results stay aligned with the partition's
    # rows; chunking replaces the manual accumulate-and-flush loop.
    entities = []
    for start in range(0, len(sentences), batch_size):
        chunk = sentences[start:start + batch_size]
        entities.extend(handle_batch(chunk, nlp, MODEL_KB))
    return entities

In [10]:
# Start the entity table from a copy of the sentence frame, so the column
# assignments that follow are made on entities_df rather than sentences_df.
entities_df = sentences_df.copy()

In [11]:
# (column name, dtype) for each position of the entity tuples built by
# handle_batch: (eid, kb, text, concept id, score, start offset, end offset).
ENTITY_FIELDS = [
    ("eid", "int"),
    ("eclass", "str"),
    ("etext", "str"),
    ("elabel", "str"),
    ("escore", "float"),
    ("ent_start_char", "int"),
    ("ent_end_char", "int"),
]

# Rebuild from the sentence frame so this cell can be re-run on its own
# (it drops the stext column that handle_partition needs).
entities_df = sentences_df.copy()

# One list of entity tuples per sentence; meta is given as (name, dtype).
entities_df["entities"] = entities_df.map_partitions(
    handle_partition, meta=("entities", "object"))
entities_df = entities_df.drop(columns=["stext"])
# One row per entity tuple; rows with missing values are dropped, which
# includes sentences that produced no entities.
entities_df = entities_df.explode("entities")
entities_df = entities_df.dropna()

# Unpack each tuple position into its own column. `position` is bound as a
# default argument so every lambda keeps its own index.
for position, (column, dtype) in enumerate(ENTITY_FIELDS):
    entities_df[column] = entities_df.apply(
        lambda row, position=position: row.entities[position],
        meta=(column, dtype), axis=1)

entities_df = entities_df.drop(columns=["entities"])

In [12]:
# Cast every output column to its final type: identifiers and text to str,
# counters and character offsets to int32, the linker score to float32.
for column, dtype in [
    ("cord_uid", str),
    ("pid", str),
    ("sid", np.int32),
    ("eid", np.int32),
    ("eclass", str),
    ("etext", str),
    ("elabel", str),
    ("escore", np.float32),
    ("ent_start_char", np.int32),
    ("ent_end_char", np.int32),
]:
    entities_df[column] = entities_df[column].astype(dtype)

In [13]:
# Remove any output left by a previous run before writing.
# NOTE: this recursively deletes everything under ENTITIES_FOLDER.
fs = s3fs.S3FileSystem()
if fs.exists(ENTITIES_FOLDER):
    fs.rm(ENTITIES_FOLDER, recursive=True)

In [14]:
# Write the entity table to ENTITIES_FOLDER as Snappy-compressed parquet.
entities_df.to_parquet(ENTITIES_FOLDER, engine="pyarrow", compression="snappy")

  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]


## Verify Result

In [15]:
# Read back what was written (this rebinds entities_df to the stored
# dataset) and preview it.
entities_df = dd.read_parquet(ENTITIES_FOLDER, engine="pyarrow")
entities_df.head(npartitions=10)

Unnamed: 0,cord_uid,pid,sid,eid,eclass,etext,elabel,escore,ent_start_char,ent_end_char
6498,sz7qmi8q,A0,8,0,hpo,Im,C1334260,0.770292,0,2
6498,sz7qmi8q,A2,1,8,hpo,CPI,C0007789,1.0,218,221
6498,sz7qmi8q,A2,1,8,hpo,CPI,C0162674,1.0,218,221
6498,sz7qmi8q,B3,5,0,hpo,Im,C1334260,0.770292,0,2
6498,sz7qmi8q,B5,6,0,hpo,Im,C1334260,0.770292,0,2


In [16]:
# Number of entity-link rows in the stored dataset.
len(entities_df)

18750