# Entity Extraction+Linking using modern SciSpaCy models

## Initialize Dask Clusters

In [1]:
from dask_saturn.core import describe_sizes

describe_sizes()

{'medium': 'Medium - 2 cores - 4 GB RAM',
 'large': 'Large - 2 cores - 16 GB RAM',
 'xlarge': 'XLarge - 4 cores - 32 GB RAM',
 '2xlarge': '2XLarge - 8 cores - 64 GB RAM',
 '4xlarge': '4XLarge - 16 cores - 128 GB RAM',
 '8xlarge': '8XLarge - 32 cores - 256 GB RAM',
 '12xlarge': '12XLarge - 48 cores - 384 GB RAM',
 '16xlarge': '16XLarge - 64 cores - 512 GB RAM',
 'g4dnxlarge': 'T4-XLarge - 4 cores - 16 GB RAM - 1 GPU',
 'g4dn4xlarge': 'T4-4XLarge - 16 cores - 64 GB RAM - 1 GPU',
 'g4dn8xlarge': 'T4-8XLarge - 32 cores - 128 GB RAM - 1 GPU',
 'p32xlarge': 'V100-2XLarge - 8 cores - 61 GB RAM - 1 GPU',
 'p38xlarge': 'V100-8XLarge - 32 cores - 244 GB RAM - 4 GPU',
 'p316xlarge': 'V100-16XLarge - 64 cores - 488 GB RAM - 8 GPU'}

In [2]:
from dask.distributed import Client, wait
from dask_saturn import SaturnCluster
import time

n_workers = 5
cluster = SaturnCluster(n_workers=n_workers, 
                        scheduler_size='2xlarge', 
                        worker_size='4xlarge', 
                        nthreads=16)
client = Client(cluster)
cluster

[2020-10-02 16:25:41] INFO - dask-saturn | Starting cluster. Status: pending
[2020-10-02 16:25:48] INFO - dask-saturn | Starting cluster. Status: pending
[2020-10-02 16:26:02] INFO - dask-saturn | Cluster is ready


VBox(children=(HTML(value='<h2>SaturnCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n   …

In [3]:
while len(client.scheduler_info()['workers']) < n_workers:
    print('Waiting for workers, got', len(client.scheduler_info()['workers']))
    time.sleep(30)
print('Done!')

Waiting for workers, got 0
Waiting for workers, got 4
Done!


In [4]:
import dask.dataframe as dd
import json
import numpy as np
import pandas as pd
import s3fs
import spacy
import scispacy

from dask.distributed import Client, progress, get_worker

from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker

## Processing

In [5]:
# MODEL_KB = "umls"
MODEL_KB = "mesh"
# MODEL_KB = "go"
# MODEL_KB = "hpo"
# MODEL_KB = "rxnorm"

In [6]:
BUCKET_NAME = "saturn-elsevierinc"

SENTENCE_FOLDER = "/".join(["s3:/", BUCKET_NAME, "incremental", "add-sents"])
ENTITIES_FOLDER = "/".join(["s3:/", BUCKET_NAME, "incremental",
                            "add-ents-{:s}".format(MODEL_KB)])

In [7]:
sentences_df = dd.read_parquet(SENTENCE_FOLDER, engine="pyarrow")
sentences_df.head()

Unnamed: 0,cord_uid,pid,sid,stext
38548,l2m8y422,T,0,Correction: Selective laser trabeculoplasty: p...
38563,kwby80nj,T,0,Publishing in the transfusion field: “
38563,kwby80nj,T,1,Like a Bridge Over Trouble Water” in a “The time
38563,kwby80nj,T,2,They Are A Changing” period
38565,9vbwzi8v,T,0,Nachfrage nicht zu bremsen


In [8]:
len(sentences_df)

228953

In [9]:
def build_nlp_pipeline(model_kb):
    nlp = spacy.load("en_core_sci_md")
    
    abbr_detector = AbbreviationDetector(nlp)
    nlp.add_pipe(abbr_detector)
    
    linker = EntityLinker(resolve_abbreviations=True,
                          filter_for_definitions=False,
                          name=model_kb)
    nlp.add_pipe(linker)
    
    return nlp


def nlp_workers():
    import traceback
    try:
        worker = get_worker()
        nlp = build_nlp_pipeline(MODEL_KB)
        worker.nlp = nlp
    except:
        return traceback.format_exc()
    return 0


def check_nlp_workers():
    worker = get_worker()
    return str(worker.nlp)


%time client.run(nlp_workers)

CPU times: user 12.9 ms, sys: 523 µs, total: 13.4 ms
Wall time: 13.9 s


{'tcp://10.0.12.99:36843': 0,
 'tcp://10.0.14.144:40629': 0,
 'tcp://10.0.25.75:34391': 0,
 'tcp://10.0.6.180:33347': 0,
 'tcp://10.0.6.211:35869': 0}

In [10]:
client.run(check_nlp_workers)

{'tcp://10.0.12.99:36843': '<spacy.lang.en.English object at 0x7f2a700e9fd0>',
 'tcp://10.0.14.144:40629': '<spacy.lang.en.English object at 0x7fac94783f90>',
 'tcp://10.0.25.75:34391': '<spacy.lang.en.English object at 0x7f01bc109dd0>',
 'tcp://10.0.6.180:33347': '<spacy.lang.en.English object at 0x7fe8aff88d90>',
 'tcp://10.0.6.211:35869': '<spacy.lang.en.English object at 0x7ff51216a850>'}

In [11]:
def handle_batch(sents, nlp, model_kb):
    docs = nlp.pipe(sents, n_threads=16, batch_size=len(sents))
    ents_list = []
    try:
        for doc in docs:
            ents = []
            for eid, ent in enumerate(doc.ents):
                try:
                    kb_ents = ent._.kb_ents
                    for cid, score in kb_ents:
                        ents.append((eid, model_kb, ent.text, cid, 
                                     score, ent.start_char, ent.end_char))
                except KeyError:
                    continue
            ents_list.append(ents)
    except KeyError:
        pass
    return ents_list


def handle_partition(part):
    worker = get_worker()
    nlp = worker.nlp
    batch_size = 32
    sent_batch, ent_batch, entities = [], [], []
    for _, row in part.iterrows():
        if len(sent_batch) % batch_size == 0 and len(sent_batch) > 0:
            ent_batch = handle_batch(sent_batch, nlp, MODEL_KB)
            entities.extend(ent_batch)
            sent_batch = []
        try:
            sent_batch.append(row.stext)
        except ValueError:
            continue
    if len(sent_batch) > 0:
        ent_batch = handle_batch(sent_batch, nlp, MODEL_KB)
        entities.extend(ent_batch)
    return entities

In [12]:
entities_df = sentences_df.copy()

In [13]:
entities_df["entities"] = entities_df.map_partitions(
    lambda part: handle_partition(part), meta=("object"))
entities_df = entities_df.drop(columns=["stext"])
entities_df = entities_df.explode("entities")
entities_df = entities_df.dropna()

entities_df["eid"] = entities_df.apply(
    lambda row: row.entities[0], meta=("int"), axis=1)
entities_df["eclass"] = entities_df.apply(
    lambda row: row.entities[1], meta=("str"), axis=1)
entities_df["etext"] = entities_df.apply(
    lambda row: row.entities[2], meta=("str"), axis=1)
entities_df["elabel"] = entities_df.apply(
    lambda row: row.entities[3], meta=("str"), axis=1)
entities_df["escore"] = entities_df.apply(
    lambda row: row.entities[4], meta=("float"), axis=1)
entities_df["ent_start_char"] = entities_df.apply(
    lambda row: row.entities[5], meta=("int"), axis=1)
entities_df["ent_end_char"] = entities_df.apply(
    lambda row: row.entities[6], meta=("int"), axis=1)

entities_df = entities_df.drop(columns=["entities"])

In [14]:
entities_df.cord_uid = entities_df.cord_uid.astype(str)
entities_df.pid = entities_df.pid.astype(str)
entities_df.sid = entities_df.sid.astype(np.int32)
entities_df.eid = entities_df.eid.astype(np.int32)
entities_df.eclass = entities_df.eclass.astype(str)
entities_df.etext = entities_df.etext.astype(str)
entities_df.elabel = entities_df.elabel.astype(str)
entities_df.escore = entities_df.escore.astype(np.float32)
entities_df.ent_start_char = entities_df.ent_start_char.astype(np.int32)
entities_df.ent_end_char = entities_df.ent_end_char.astype(np.int32)

In [15]:
fs = s3fs.S3FileSystem()
if fs.exists(ENTITIES_FOLDER):
    fs.rm(ENTITIES_FOLDER, recursive=True)

In [16]:
%%time
entities_df.to_parquet(ENTITIES_FOLDER, engine="pyarrow", compression="snappy")

CPU times: user 356 ms, sys: 43.6 ms, total: 399 ms
Wall time: 8min 32s


## Verify Result

In [17]:
ENTITIES_FOLDER

's3://saturn-elsevierinc/incremental/add-ents-mesh'

In [18]:
fs.du(ENTITIES_FOLDER) / 1e6

24.164175

In [19]:
entities_df = dd.read_parquet(ENTITIES_FOLDER, engine="pyarrow")
entities_df.head()

Unnamed: 0,cord_uid,pid,sid,eid,eclass,etext,elabel,escore,ent_start_char,ent_end_char
38548,l2m8y422,T,0,1,mesh,Selective laser trabeculoplasty,D014130,0.786378,12,43
38563,kwby80nj,T,0,0,mesh,Publishing,D011643,1.0,0,10
38563,kwby80nj,T,0,0,mesh,Publishing,D015871,0.795135,0,10
38563,kwby80nj,T,0,0,mesh,Publishing,D066295,0.736701,0,10
38563,kwby80nj,T,0,0,mesh,Publishing,D000073820,0.73543,0,10


In [20]:
len(entities_df)

2853149

In [21]:
# do this if youre done using the cluster
cluster.close()