# Entity Extraction and Linking Using Modern scispaCy Models

## Initialize Dask Cluster

In [1]:
from dask_saturn.core import describe_sizes

# Show the instance sizes that can be chosen for the scheduler and workers.
describe_sizes()

{'medium': 'Medium - 2 cores - 4 GB RAM',
 'large': 'Large - 2 cores - 16 GB RAM',
 'xlarge': 'XLarge - 4 cores - 32 GB RAM',
 '2xlarge': '2XLarge - 8 cores - 64 GB RAM',
 '4xlarge': '4XLarge - 16 cores - 128 GB RAM',
 '8xlarge': '8XLarge - 32 cores - 256 GB RAM',
 '12xlarge': '12XLarge - 48 cores - 384 GB RAM',
 '16xlarge': '16XLarge - 64 cores - 512 GB RAM',
 'g4dnxlarge': 'T4-XLarge - 4 cores - 16 GB RAM - 1 GPU',
 'g4dn4xlarge': 'T4-4XLarge - 16 cores - 64 GB RAM - 1 GPU',
 'g4dn8xlarge': 'T4-8XLarge - 32 cores - 128 GB RAM - 1 GPU',
 'p32xlarge': 'V100-2XLarge - 8 cores - 61 GB RAM - 1 GPU',
 'p38xlarge': 'V100-8XLarge - 32 cores - 244 GB RAM - 4 GPU',
 'p316xlarge': 'V100-16XLarge - 64 cores - 488 GB RAM - 8 GPU'}

In [2]:
from dask.distributed import Client, wait
from dask_saturn import SaturnCluster
import time

# Create the Saturn Dask cluster and connect a client to it.
n_workers = 10
cluster = SaturnCluster(n_workers=n_workers, 
                        scheduler_size='2xlarge', 
                        worker_size='4xlarge', 
                        nthreads=16)  # one thread per core: '4xlarge' is listed with 16 cores
client = Client(cluster)
# Last expression of the cell: renders the cluster widget.
cluster

[2020-09-17 17:34:32] INFO - dask-saturn | Starting cluster. Status: pending
[2020-09-17 17:34:37] INFO - dask-saturn | Starting cluster. Status: pending
[2020-09-17 17:34:56] INFO - dask-saturn | Cluster is ready


VBox(children=(HTML(value='<h2>SaturnCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n   …

In [3]:
# Block until every requested worker has registered with the scheduler.
while True:
    # Query the scheduler once per pass so the count that is printed is
    # exactly the count that was compared against n_workers.
    n_ready = len(client.scheduler_info()['workers'])
    if n_ready >= n_workers:
        break
    print('Waiting for workers, got', n_ready)
    time.sleep(30)
print('Done!')

Waiting for workers, got 0
Waiting for workers, got 0
Done!


In [4]:
import dask.dataframe as dd
import json
import numpy as np
import pandas as pd
import s3fs
import spacy
import scispacy

from dask.distributed import Client, progress, get_worker

from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker

## Processing

In [5]:
# Knowledge base name handed to the entity linker (see build_nlp_pipeline);
# it is also embedded in the output folder name and in every entity row.
# Alternatives left here by the author: "umls", "mesh", "go", "rxnorm".
MODEL_KB = "hpo"

In [6]:
# S3 locations: sentence input and per-knowledge-base entity output.
BUCKET_NAME = "saturn-elsevierinc"

# The first element is "s3:/" (single slash) on purpose: joining with "/"
# supplies the second slash, giving "s3://<bucket>/<folder>".
SENTENCE_FOLDER = "/".join(["s3:/", BUCKET_NAME, "cord19-sents-pqr"])
# The output folder name embeds MODEL_KB, so each knowledge base gets its own folder.
ENTITIES_FOLDER = "/".join(["s3:/", BUCKET_NAME, 
                            "cord19-ents-{:s}-pq".format(MODEL_KB)])

In [7]:
# Open the sentence dataset (Parquet) as a Dask DataFrame.
sentences_df = dd.read_parquet(SENTENCE_FOLDER, engine="pyarrow")
# Preview the first rows, taking them from up to the first 20 partitions.
sentences_df.head(npartitions=20)

Unnamed: 0,cord_uid,pid,sid,stext
0,ug7v899j,T,0,Clinical features of culture-proven Mycoplasma...
0,ug7v899j,A0,0,Objective: This retrospective chart review des...
0,ug7v899j,A1,0,Patients with positive M. pneumoniae cultures ...
0,ug7v899j,A1,1,Charts of patients were reviewed.
0,ug7v899j,A2,0,"Results: 40 patients were identified, 33 (82.5..."


In [8]:
# Total number of sentence rows to process.
len(sentences_df)

16952279

In [9]:
def build_nlp_pipeline(model_kb):
    """Assemble the entity extraction + linking pipeline.

    Loads the ``en_core_sci_md`` model, then appends an abbreviation
    detector followed by an entity linker configured for ``model_kb``.

    Parameters
    ----------
    model_kb : str
        Knowledge base name, passed to ``EntityLinker`` as ``name``.

    Returns
    -------
    The loaded pipeline with both components added.
    """
    pipeline = spacy.load("en_core_sci_md")

    # The abbreviation detector is added before the linker, which is
    # configured with resolve_abbreviations=True.
    pipeline.add_pipe(AbbreviationDetector(pipeline))
    pipeline.add_pipe(EntityLinker(name=model_kb,
                                   resolve_abbreviations=True,
                                   filter_for_definitions=False))

    return pipeline


def nlp_workers():
    """Build the NLP pipeline on the current Dask worker and cache it.

    The pipeline for ``MODEL_KB`` is stored as ``worker.nlp`` so later tasks
    on the same worker can reuse it instead of rebuilding it.

    Returns
    -------
    int or str
        ``0`` on success; otherwise the formatted traceback of the failure,
        so the caller can see per-worker errors in the returned value.
    """
    import traceback
    try:
        worker = get_worker()
        worker.nlp = build_nlp_pipeline(MODEL_KB)
    except Exception:
        # Report the failure as a string instead of raising. Only Exception
        # is caught, so KeyboardInterrupt / SystemExit still propagate.
        return traceback.format_exc()
    return 0


def check_nlp_workers():
    """Return ``str()`` of the pipeline cached on the current Dask worker.

    Expects ``worker.nlp`` to have been set beforehand (``nlp_workers`` does this).
    """
    return str(get_worker().nlp)


# Build the pipeline on every worker. In the returned dict a value of 0 means
# success for that worker; any other value is that worker's traceback text.
%time client.run(nlp_workers)

CPU times: user 9.58 ms, sys: 3.38 ms, total: 13 ms
Wall time: 12.6 s


{'tcp://10.0.13.221:41357': 0,
 'tcp://10.0.18.164:34669': 0,
 'tcp://10.0.18.197:46173': 0,
 'tcp://10.0.19.210:39561': 0,
 'tcp://10.0.2.193:43959': 0,
 'tcp://10.0.2.194:43887': 0,
 'tcp://10.0.29.203:42685': 0,
 'tcp://10.0.4.144:37035': 0,
 'tcp://10.0.7.193:33495': 0,
 'tcp://10.0.7.87:44987': 0}

In [10]:
# Sanity check: every worker should report a pipeline object.
client.run(check_nlp_workers)

{'tcp://10.0.13.221:41357': '<spacy.lang.en.English object at 0x7f1434d47fd0>',
 'tcp://10.0.18.164:34669': '<spacy.lang.en.English object at 0x7ff032f1ac50>',
 'tcp://10.0.18.197:46173': '<spacy.lang.en.English object at 0x7fa918a31e50>',
 'tcp://10.0.19.210:39561': '<spacy.lang.en.English object at 0x7f71b51b02d0>',
 'tcp://10.0.2.193:43959': '<spacy.lang.en.English object at 0x7f1c38b78650>',
 'tcp://10.0.2.194:43887': '<spacy.lang.en.English object at 0x7f87c65b4310>',
 'tcp://10.0.29.203:42685': '<spacy.lang.en.English object at 0x7f849fcdd1d0>',
 'tcp://10.0.4.144:37035': '<spacy.lang.en.English object at 0x7fab57098e90>',
 'tcp://10.0.7.193:33495': '<spacy.lang.en.English object at 0x7f2f71dddcd0>',
 'tcp://10.0.7.87:44987': '<spacy.lang.en.English object at 0x7f94df556fd0>'}

In [11]:
def handle_batch(sents, nlp, model_kb):
    """Run the pipeline over one batch of sentences and collect linked entities.

    Parameters
    ----------
    sents : list of str
        Sentences to process; the whole list goes through ``nlp.pipe`` as a
        single batch.
    nlp : object
        Pipeline exposing ``pipe(texts, batch_size=...)`` and yielding
        documents that have an ``ents`` sequence.
    model_kb : str
        Knowledge base label copied into every emitted tuple.

    Returns
    -------
    list of list of tuple
        One inner list per input sentence, in input order. Each tuple is
        ``(eid, model_kb, ent.text, cid, score, ent.start_char, ent.end_char)``,
        where ``eid`` is the entity's position in ``doc.ents`` and
        ``(cid, score)`` is one candidate from ``ent._.kb_ents``. An entity
        with several candidates therefore contributes several tuples.
    """
    if len(sents) == 0:
        # Nothing to process; also avoids requesting batch_size=0.
        return []

    # n_threads is deliberately not passed: per the spaCy docs it has been
    # deprecated since v2.1 and is not accepted by Language.pipe in v3.
    docs = nlp.pipe(sents, batch_size=len(sents))

    ents_list = []
    for doc in docs:
        ents = []
        for eid, ent in enumerate(doc.ents):
            try:
                kb_ents = ent._.kb_ents
            except KeyError:
                # No candidates can be read for this entity: skip it.
                continue
            for cid, score in kb_ents:
                ents.append((eid, model_kb, ent.text, cid,
                             score, ent.start_char, ent.end_char))
        ents_list.append(ents)
    return ents_list


def handle_partition(part, batch_size=32):
    """Extract linked entities for every sentence in one DataFrame partition.

    Uses the pipeline cached on the current Dask worker (``worker.nlp``) and
    the module-level ``MODEL_KB`` label, and delegates the NLP work to
    ``handle_batch`` in chunks of ``batch_size`` sentences.

    Parameters
    ----------
    part : DataFrame
        Partition with an ``stext`` column holding the sentence text.
    batch_size : int, optional
        Number of sentences per ``handle_batch`` call (default 32). Must be
        positive.

    Returns
    -------
    list of list of tuple
        Exactly one entry per row of ``part``, in row order, so the result can
        be assigned back as a column. Rows whose ``stext`` is not a string
        (e.g. a missing value) get an empty list rather than being dropped,
        which keeps the result aligned with the partition.
    """
    nlp = get_worker().nlp

    texts = part["stext"].tolist()
    # Pre-fill with empty lists so every row has a slot, whatever happens below.
    entities = [[] for _ in texts]
    # Remember each usable sentence's row position so results land in the
    # right slot even when some rows are skipped.
    usable = [(pos, text) for pos, text in enumerate(texts)
              if isinstance(text, str)]

    for start in range(0, len(usable), batch_size):
        chunk = usable[start:start + batch_size]
        batch_result = handle_batch([text for _, text in chunk], nlp, MODEL_KB)
        for (pos, _), ents in zip(chunk, batch_result):
            entities[pos] = ents
    return entities

In [12]:
# Start from a copy so that sentences_df keeps its original columns.
entities_df = sentences_df.copy()

In [13]:
# Attach one list of entity tuples per sentence, then fan out to one row per
# tuple. Rows with missing values are dropped afterwards, which includes
# sentences that produced no entities.
entities_df["entities"] = entities_df.map_partitions(
    handle_partition, meta=("object"))
entities_df = (entities_df
               .drop(columns=["stext"])
               .explode("entities")
               .dropna())

# Unpack each entity tuple into its own column. The order below mirrors the
# tuple layout built in handle_batch, so the list position is the tuple index.
for position, (column, dtype) in enumerate([
        ("eid", "int"),
        ("eclass", "str"),
        ("etext", "str"),
        ("elabel", "str"),
        ("escore", "float"),
        ("ent_start_char", "int"),
        ("ent_end_char", "int")]):
    # position is bound as a default argument so each lambda keeps its own index.
    entities_df[column] = entities_df.apply(
        lambda row, position=position: row.entities[position],
        meta=(dtype), axis=1)

entities_df = entities_df.drop(columns=["entities"])

In [14]:
# Pin the dtype of every output column in a single cast.
entities_df = entities_df.astype({
    "cord_uid": str,
    "pid": str,
    "sid": np.int32,
    "eid": np.int32,
    "eclass": str,
    "etext": str,
    "elabel": str,
    "escore": np.float32,
    "ent_start_char": np.int32,
    "ent_end_char": np.int32,
})

In [15]:
# Clear any existing output folder before writing (presumably left over from
# an earlier run). NOTE: this recursively deletes everything under
# ENTITIES_FOLDER.
fs = s3fs.S3FileSystem()
if fs.exists(ENTITIES_FOLDER):
    fs.rm(ENTITIES_FOLDER, recursive=True)

In [16]:
%%time
# Write the entity table to S3 as snappy-compressed Parquet. This is the
# long-running step of the notebook (see the recorded wall time).
entities_df.to_parquet(ENTITIES_FOLDER, engine="pyarrow", compression="snappy")

CPU times: user 10.3 s, sys: 1.03 s, total: 11.3 s
Wall time: 3h 2min 54s


## Verify Result

In [17]:
# Output location that was written above.
ENTITIES_FOLDER

's3://saturn-elsevierinc/cord19-ents-hpo-pq'

In [18]:
# Size of the written dataset divided by 1e6 (presumably bytes -> MB; confirm fs.du units).
fs.du(ENTITIES_FOLDER) / 1e6

380.468623

In [19]:
# Re-read the stored result. This rebinds entities_df: from here on it refers
# to the Parquet data on S3, not to the extraction pipeline built earlier.
entities_df = dd.read_parquet(ENTITIES_FOLDER, engine="pyarrow")
entities_df.head(npartitions=10)

Unnamed: 0,cord_uid,pid,sid,eid,eclass,etext,elabel,escore,ent_start_char,ent_end_char
0,ug7v899j,A1,0,1,hpo,positive,C4293677,0.775612,14,22
0,ug7v899j,A1,0,1,hpo,positive,C4703609,0.719564,14,22
0,ug7v899j,A1,0,1,hpo,positive,C4019252,0.709962,14,22
0,ug7v899j,A1,0,2,hpo,M. pneumoniae,C0032285,0.850097,23,36
0,ug7v899j,A2,1,0,hpo,infections,C1853193,0.824326,5,15


In [20]:
# Total number of stored rows (one per entity/candidate pair).
len(entities_df)

44464091

In [21]:
# Shut the cluster down once you are done using it.
cluster.close()