# Entity Extraction+Linking using modern SciSpaCy models

## Initialize Dask Clusters

In [1]:
from dask_saturn.core import describe_sizes

# Show the instance size keys that can be passed as scheduler_size / worker_size below.
describe_sizes()

{'medium': 'Medium - 2 cores - 4 GB RAM',
 'large': 'Large - 2 cores - 16 GB RAM',
 'xlarge': 'XLarge - 4 cores - 32 GB RAM',
 '2xlarge': '2XLarge - 8 cores - 64 GB RAM',
 '4xlarge': '4XLarge - 16 cores - 128 GB RAM',
 '8xlarge': '8XLarge - 32 cores - 256 GB RAM',
 '12xlarge': '12XLarge - 48 cores - 384 GB RAM',
 '16xlarge': '16XLarge - 64 cores - 512 GB RAM',
 'g4dnxlarge': 'T4-XLarge - 4 cores - 16 GB RAM - 1 GPU',
 'g4dn4xlarge': 'T4-4XLarge - 16 cores - 64 GB RAM - 1 GPU',
 'g4dn8xlarge': 'T4-8XLarge - 32 cores - 128 GB RAM - 1 GPU',
 'p32xlarge': 'V100-2XLarge - 8 cores - 61 GB RAM - 1 GPU',
 'p38xlarge': 'V100-8XLarge - 32 cores - 244 GB RAM - 4 GPU',
 'p316xlarge': 'V100-16XLarge - 64 cores - 488 GB RAM - 8 GPU'}

In [2]:
from dask.distributed import Client, wait
from dask_saturn import SaturnCluster
import time

n_workers = 10

# Cluster shape: a '2xlarge' scheduler and '4xlarge' workers. describe_sizes()
# lists '4xlarge' as 16 cores, which matches nthreads=16.
cluster_options = dict(
    n_workers=n_workers,
    scheduler_size='2xlarge',
    worker_size='4xlarge',
    nthreads=16,
)
cluster = SaturnCluster(**cluster_options)
client = Client(cluster)

# Leave the cluster object as the last expression so the notebook renders it.
cluster

[2020-09-18 00:44:21] INFO - dask-saturn | Starting cluster. Status: stopping
[2020-09-18 00:44:30] INFO - dask-saturn | Starting cluster. Status: pending
[2020-09-18 00:44:49] INFO - dask-saturn | Cluster is ready


VBox(children=(HTML(value='<h2>SaturnCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n   …

In [3]:
# Poll the scheduler until all requested workers have registered.
# The worker count is read once per iteration so the number that is printed is
# the same number that decided whether to keep waiting.
while True:
    n_ready = len(client.scheduler_info()['workers'])
    if n_ready >= n_workers:
        break
    print('Waiting for workers, got', n_ready)
    time.sleep(30)
print('Done!')

Waiting for workers, got 0
Waiting for workers, got 0
Done!


In [4]:
import dask.dataframe as dd
import json
import numpy as np
import pandas as pd
import s3fs
import spacy
import scispacy

from dask.distributed import Client, progress, get_worker

from scispacy.linking_utils import RxNorm

## Processing

In [5]:
# NOTE: see SciSpaCy issue 249 -- because of it, the logic in this notebook
# differs from the other 04*x notebooks.
# MODEL_KB is used to tag each extracted entity (it ends up in the `eclass`
# column) and to name the output folder. It is also passed to
# build_nlp_pipeline, which does not use it.
MODEL_KB = "rxnorm"

In [6]:
BUCKET_NAME = "saturn-elsevierinc"

# Input (sentences) and output (entities) both live under the bucket's
# "incremental" prefix; the output folder name carries the knowledge-base tag.
SENTENCE_FOLDER = "s3://{:s}/incremental/add-sents".format(BUCKET_NAME)
ENTITIES_FOLDER = "s3://{:s}/incremental/add-ents-{:s}".format(
    BUCKET_NAME, MODEL_KB)

In [7]:
# Open the sentence-level parquet dataset (columns: cord_uid, pid, sid, stext).
sentences_df = dd.read_parquet(SENTENCE_FOLDER, engine="pyarrow")
# Preview a few rows; npartitions=20 allows head() to draw from the first 20 partitions.
sentences_df.head(npartitions=20)

Unnamed: 0,cord_uid,pid,sid,stext
0,ug7v899j,T,0,Clinical features of culture-proven Mycoplasma...
0,ug7v899j,A0,0,Objective: This retrospective chart review des...
0,ug7v899j,A1,0,Patients with positive M. pneumoniae cultures ...
0,ug7v899j,A1,1,Charts of patients were reviewed.
0,ug7v899j,A2,0,"Results: 40 patients were identified, 33 (82.5..."


In [8]:
# Total number of sentence rows to process.
len(sentences_df)

16952279

In [9]:
def build_nlp_pipeline(model_kb):
    """Create the objects needed for entity extraction and linking.

    Args:
        model_kb: knowledge-base tag. Accepted for signature compatibility
            but not consulted here: the RxNorm knowledge base is always used.

    Returns:
        A ``(nlp, kb)`` tuple: the loaded ``en_core_sci_md`` spaCy pipeline
        and a ``RxNorm`` knowledge base instance.
    """
    return spacy.load("en_core_sci_md"), RxNorm()


def nlp_workers():
    """Build the NLP pipeline on the current Dask worker and cache it there.

    Meant to be executed on each worker (e.g. through ``client.run``). The
    pipeline and knowledge base returned by ``build_nlp_pipeline(MODEL_KB)``
    are stored as ``worker.nlp`` and ``worker.kb`` so that later tasks can
    reuse them instead of reloading the model.

    Returns:
        ``0`` on success, otherwise the formatted traceback (a string) of the
        error raised during setup, so the caller can see per-worker failures.
    """
    # Imported locally so the function is self-contained when shipped to workers.
    import traceback
    try:
        worker = get_worker()
        worker.nlp, worker.kb = build_nlp_pipeline(MODEL_KB)
    except Exception:
        # Only ordinary errors are reported back as text; KeyboardInterrupt and
        # SystemExit are allowed to propagate instead of being swallowed.
        return traceback.format_exc()
    return 0


def check_nlp_workers():
    """Return the string form of the pipeline cached on the current worker.

    Reads ``worker.nlp``, so it raises ``AttributeError`` on a worker where
    that attribute has not been set.
    """
    return str(get_worker().nlp)


# Run the setup function on the workers; each entry in the result is 0 on
# success or a traceback string if loading failed on that worker.
%time client.run(nlp_workers)

CPU times: user 10.8 ms, sys: 315 µs, total: 11.1 ms
Wall time: 8.87 s


{'tcp://10.0.0.74:35109': 0,
 'tcp://10.0.16.145:35845': 0,
 'tcp://10.0.16.74:36977': 0,
 'tcp://10.0.17.137:41491': 0,
 'tcp://10.0.19.210:34045': 0,
 'tcp://10.0.21.47:34513': 0,
 'tcp://10.0.21.64:43635': 0,
 'tcp://10.0.30.213:33213': 0,
 'tcp://10.0.7.244:45665': 0,
 'tcp://10.0.7.71:40887': 0}

In [10]:
# Sanity check: every worker should report the str() of its cached pipeline.
client.run(check_nlp_workers)

{'tcp://10.0.0.74:35109': '<spacy.lang.en.English object at 0x7f861c79dfd0>',
 'tcp://10.0.16.145:35845': '<spacy.lang.en.English object at 0x7fa8048f9cd0>',
 'tcp://10.0.16.74:36977': '<spacy.lang.en.English object at 0x7fbe9ca5ffd0>',
 'tcp://10.0.17.137:41491': '<spacy.lang.en.English object at 0x7f852135efd0>',
 'tcp://10.0.19.210:34045': '<spacy.lang.en.English object at 0x7f1a206901d0>',
 'tcp://10.0.21.47:34513': '<spacy.lang.en.English object at 0x7f63259b3590>',
 'tcp://10.0.21.64:43635': '<spacy.lang.en.English object at 0x7f7a2daacf10>',
 'tcp://10.0.30.213:33213': '<spacy.lang.en.English object at 0x7f244229c050>',
 'tcp://10.0.7.244:45665': '<spacy.lang.en.English object at 0x7f39f9885e10>',
 'tcp://10.0.7.71:40887': '<spacy.lang.en.English object at 0x7f9861a99d10>'}

In [11]:
def handle_batch(sents, nlp, kb, model_kb):
    """Extract entities from a batch of sentences and link them by exact alias.

    Args:
        sents: sized sequence of sentence strings.
        nlp: pipeline object exposing ``pipe(texts, batch_size=...)`` whose
            documents have an ``ents`` attribute.
        kb: knowledge base exposing an ``alias_to_cuis`` mapping from alias
            text to an iterable of concept ids.
        model_kb: tag stored in every entity tuple (second field).

    Returns:
        A list with one entry per input sentence. Each entry is a list of
        tuples ``(eid, model_kb, text, cui, 1.0, start_char, end_char)``:
        ``eid`` is the entity's position in ``doc.ents``, the score is fixed
        at ``1.0``, and one tuple is produced per concept id found for the
        entity text. Entities whose text is not a known alias are skipped, so
        ``eid`` values may have gaps.
    """
    if len(sents) == 0:
        # Nothing to process; also avoids asking the pipeline for batch_size=0.
        return []

    # The former `n_threads` keyword is not passed any more: it is a deprecated
    # no-op in spaCy 2.1+ and is not accepted by spaCy 3.
    docs = nlp.pipe(sents, batch_size=len(sents))

    ents_list = []
    for doc in docs:
        ents = []
        for eid, ent in enumerate(doc.ents):
            # Keep the try narrow: only the alias lookup is expected to fail.
            try:
                cuis = kb.alias_to_cuis[ent.text]
            except KeyError:
                continue
            for cui in cuis:
                ents.append((eid, model_kb, ent.text, cui,
                             1.0, ent.start_char, ent.end_char))
        ents_list.append(ents)
    return ents_list


def handle_partition(part, batch_size=32):
    """Run entity extraction over every sentence of one partition.

    Uses the pipeline and knowledge base cached on the current Dask worker
    (``worker.nlp`` / ``worker.kb``) and feeds the ``stext`` column to
    ``handle_batch`` in consecutive chunks.

    Args:
        part: pandas DataFrame partition with an ``stext`` column.
        batch_size: number of sentences sent to ``handle_batch`` at a time
            (default 32, the previously hard-coded value).

    Returns:
        A list with exactly one entry per row of ``part``, in row order; each
        entry is the entity list produced by ``handle_batch`` for that row.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    worker = get_worker()
    nlp, kb = worker.nlp, worker.kb

    # Read the column once instead of materialising a Series per row.
    sentences = part["stext"].tolist()

    entities = []
    for start in range(0, len(sentences), batch_size):
        chunk = sentences[start:start + batch_size]
        entities.extend(handle_batch(chunk, nlp, kb, MODEL_KB))
    return entities

In [12]:
# Start the output frame from a copy of the sentence frame; the entity columns are added to it below.
entities_df = sentences_df.copy()

In [13]:
# Per partition, compute one list of entity tuples for every sentence row.
entities_df["entities"] = entities_df.map_partitions(
    lambda part: handle_partition(part), meta=("object"))

# The sentence text is no longer needed. Put one entity tuple on each row and
# drop rows containing missing values (e.g. sentences without any entity).
entities_df = (entities_df
               .drop(columns=["stext"])
               .explode("entities")
               .dropna())

# Unpack the entity tuple into one column per field; the position in this
# list is the position inside the tuple built by handle_batch.
entity_fields = [
    ("eid", "int"),
    ("eclass", "str"),
    ("etext", "str"),
    ("elabel", "str"),
    ("escore", "float"),
    ("ent_start_char", "int"),
    ("ent_end_char", "int"),
]
for field_pos, (field_name, field_meta) in enumerate(entity_fields):
    # Bind field_pos as a default so each lambda keeps its own index.
    entities_df[field_name] = entities_df.apply(
        lambda row, field_pos=field_pos: row.entities[field_pos],
        meta=(field_meta), axis=1)

entities_df = entities_df.drop(columns=["entities"])

In [14]:
# Pin an explicit dtype on every output column before writing to parquet.
column_dtypes = [
    ("cord_uid", str),
    ("pid", str),
    ("sid", np.int32),
    ("eid", np.int32),
    ("eclass", str),
    ("etext", str),
    ("elabel", str),
    ("escore", np.float32),
    ("ent_start_char", np.int32),
    ("ent_end_char", np.int32),
]
for column_name, column_dtype in column_dtypes:
    entities_df[column_name] = entities_df[column_name].astype(column_dtype)

In [15]:
# Delete any existing output folder (recursively) so the write below does not
# mix new files with files from an earlier run.
fs = s3fs.S3FileSystem()
if fs.exists(ENTITIES_FOLDER):
    fs.rm(ENTITIES_FOLDER, recursive=True)

In [16]:
%%time
# Write the entity rows as snappy-compressed parquet; this is the long-running step of the notebook.
entities_df.to_parquet(ENTITIES_FOLDER, engine="pyarrow", compression="snappy")

CPU times: user 6.41 s, sys: 242 ms, total: 6.65 s
Wall time: 1h 14min 2s


## Verify Result

In [17]:
# NOTE(review): the recorded output of this cell shows a different path than
# the ENTITIES_FOLDER value built from BUCKET_NAME / "incremental" /
# "add-ents-<MODEL_KB>" earlier in this notebook -- the saved output appears to
# come from an earlier run; re-run to refresh it.
ENTITIES_FOLDER

's3://saturn-elsevierinc/cord19-ents-rxnorm-pq'

In [18]:
# Output folder size scaled by 1e6 (megabytes, assuming fs.du reports bytes -- TODO confirm).
fs.du(ENTITIES_FOLDER) / 1e6

6.505494

In [19]:
# Re-read the written dataset from S3 (this rebinds entities_df to the stored
# result rather than the in-memory pipeline) and preview a few rows.
entities_df = dd.read_parquet(ENTITIES_FOLDER, engine="pyarrow")
entities_df.head(npartitions=10)

Unnamed: 0,cord_uid,pid,sid,eid,eclass,etext,elabel,escore,ent_start_char,ent_end_char
0,ug7v899j,A2,6,0,rxnorm,Cough,C0010200,1.0,0,5
0,ug7v899j,B5,0,0,rxnorm,Pneumonia,C0032285,1.0,0,9
0,ug7v899j,B5,1,0,rxnorm,Severe,C0205082,1.0,0,6
0,ug7v899j,B9,1,0,rxnorm,Pneumonia,C0032285,1.0,0,9
0,ug7v899j,B11,1,0,rxnorm,Cough,C0010200,1.0,0,5


In [20]:
# Number of entity rows in the stored dataset.
len(entities_df)

198910

In [21]:
# Run this once you are done using the cluster.
cluster.close()