# Entity Extraction from old-style SciSpacy NER Models

These models identify the entity span in an input sentence, but don't attempt to separately link to an external taxonomy. The following variations are possible here. Replace the `MODEL_NAME, MODEL_ALIAS` line in the cell below and repeat run to extract named entity information from the chosen model.

We can run this notebook with different values of `MODEL_NAME` and `MODEL_ALIAS` to create different entity dumps from each model.

## Initialize Dask Clusters

In [1]:
from dask_saturn.core import describe_sizes

describe_sizes()

{'medium': 'Medium - 2 cores - 4 GB RAM',
 'large': 'Large - 2 cores - 16 GB RAM',
 'xlarge': 'XLarge - 4 cores - 32 GB RAM',
 '2xlarge': '2XLarge - 8 cores - 64 GB RAM',
 '4xlarge': '4XLarge - 16 cores - 128 GB RAM',
 '8xlarge': '8XLarge - 32 cores - 256 GB RAM',
 '12xlarge': '12XLarge - 48 cores - 384 GB RAM',
 '16xlarge': '16XLarge - 64 cores - 512 GB RAM',
 'g4dnxlarge': 'T4-XLarge - 4 cores - 16 GB RAM - 1 GPU',
 'g4dn4xlarge': 'T4-4XLarge - 16 cores - 64 GB RAM - 1 GPU',
 'g4dn8xlarge': 'T4-8XLarge - 32 cores - 128 GB RAM - 1 GPU',
 'p32xlarge': 'V100-2XLarge - 8 cores - 61 GB RAM - 1 GPU',
 'p38xlarge': 'V100-8XLarge - 32 cores - 244 GB RAM - 4 GPU',
 'p316xlarge': 'V100-16XLarge - 64 cores - 488 GB RAM - 8 GPU'}

In [2]:
from dask.distributed import Client, wait
from dask_saturn import SaturnCluster
import time

n_workers = 5
cluster = SaturnCluster(n_workers=n_workers, 
                        scheduler_size='2xlarge', 
                        worker_size='4xlarge', 
                        nthreads=16)
client = Client(cluster)
cluster

[2020-10-02 15:37:24] INFO - dask-saturn | Starting cluster. Status: pending
[2020-10-02 15:37:31] INFO - dask-saturn | Starting cluster. Status: pending
[2020-10-02 15:37:45] INFO - dask-saturn | Starting cluster. Status: pending
[2020-10-02 15:38:11] INFO - dask-saturn | Cluster is ready


VBox(children=(HTML(value='<h2>SaturnCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n   …

In [3]:
while len(client.scheduler_info()['workers']) < n_workers:
    print('Waiting for workers, got', len(client.scheduler_info()['workers']))
    time.sleep(30)
print('Done!')

Waiting for workers, got 0
Waiting for workers, got 3
Done!


## Processing

In [4]:
import dask.dataframe as dd
import json
import numpy as np
import pandas as pd
import s3fs
import spacy
import scispacy

from dask.distributed import Client, progress, get_worker

In [5]:
MODEL_NAME, MODEL_ALIAS = "en_ner_craft_md", "craft"
# MODEL_NAME, MODEL_ALIAS = "en_ner_jnlpba_md", "jnlpba"
# MODEL_NAME, MODEL_ALIAS = "en_ner_bc5cdr_md", "bc5cdr"
# MODEL_NAME, MODEL_ALIAS = "en_ner_bionlp13cg_md", "bionlp"

In [6]:
BUCKET_NAME = "saturn-elsevierinc"

SENTENCE_FOLDER = "/".join(["s3:/", BUCKET_NAME, "incremental", "add-sents"])
ENTITIES_FOLDER = "/".join(["s3:/", BUCKET_NAME, "incremental",
                            "add-ents-{:s}".format(MODEL_ALIAS)])

In [8]:
sentences_df = dd.read_parquet(SENTENCE_FOLDER, engine="pyarrow")
sentences_df.head()

Unnamed: 0,cord_uid,pid,sid,stext
38548,l2m8y422,T,0,Correction: Selective laser trabeculoplasty: p...
38563,kwby80nj,T,0,Publishing in the transfusion field: “
38563,kwby80nj,T,1,Like a Bridge Over Trouble Water” in a “The time
38563,kwby80nj,T,2,They Are A Changing” period
38565,9vbwzi8v,T,0,Nachfrage nicht zu bremsen


In [9]:
len(sentences_df)

228953

In [10]:
def handle_batch(sents, nlp, ent_class):
    docs = nlp.pipe(sents, n_threads=16, batch_size=len(sents))
    ents_list = []
    for doc in docs:
        ents = []
        for eid, ent in enumerate(doc.ents):
            ents.append((eid, ent_class, ent.text, ent.label_, 
                         1.0, ent.start_char, ent.end_char))
        ents_list.append(ents)
    return ents_list


def handle_partition(part):
    worker = get_worker()
    try:
        nlp = worker.nlp
    except:
        nlp = spacy.load(MODEL_NAME)
        worker.nlp = nlp
    batch_size = 32
    sent_batch, ent_batch, entities = [], [], []
    for _, row in part.iterrows():
        if len(sent_batch) % batch_size == 0 and len(sent_batch) > 0:
            ent_batch = handle_batch(sent_batch, nlp, MODEL_ALIAS)
            entities.extend(ent_batch)
            sent_batch = []
        try:
            sent_batch.append(row.stext)
        except ValueError:
            continue
    if len(sent_batch) > 0:
        ent_batch = handle_batch(sent_batch, nlp, MODEL_ALIAS)
        entities.extend(ent_batch)
    return entities

In [11]:
entities_df = sentences_df.copy()

In [12]:
entities_df["entities"] = entities_df.map_partitions(
    lambda part: handle_partition(part), meta=("object"))
entities_df = entities_df.drop(columns=["stext"])
entities_df = entities_df.explode("entities")
entities_df = entities_df.dropna()

entities_df["eid"] = entities_df.apply(
    lambda row: row.entities[0], meta=("int"), axis=1)
entities_df["eclass"] = entities_df.apply(
    lambda row: row.entities[1], meta=("str"), axis=1)
entities_df["etext"] = entities_df.apply(
    lambda row: row.entities[2], meta=("str"), axis=1)
entities_df["elabel"] = entities_df.apply(
    lambda row: row.entities[3], meta=("str"), axis=1)
entities_df["escore"] = entities_df.apply(
    lambda row: row.entities[4], meta=("float"), axis=1)
entities_df["ent_start_char"] = entities_df.apply(
    lambda row: row.entities[5], meta=("int"), axis=1)
entities_df["ent_end_char"] = entities_df.apply(
    lambda row: row.entities[6], meta=("int"), axis=1)

entities_df = entities_df.drop(columns=["entities"])

In [13]:
entities_df.cord_uid = entities_df.cord_uid.astype(str)
entities_df.pid = entities_df.pid.astype(str)
entities_df.sid = entities_df.sid.astype(np.int32)
entities_df.eid = entities_df.eid.astype(np.int32)
entities_df.eclass = entities_df.eclass.astype(str)
entities_df.etext = entities_df.etext.astype(str)
entities_df.elabel = entities_df.elabel.astype(str)
entities_df.escore = entities_df.escore.astype(np.float32)
entities_df.ent_start_char = entities_df.ent_start_char.astype(np.int32)
entities_df.ent_end_char = entities_df.ent_end_char.astype(np.int32)

In [14]:
fs = s3fs.S3FileSystem()
if fs.exists(ENTITIES_FOLDER):
    fs.rm(ENTITIES_FOLDER, recursive=True)

In [15]:
%%time
entities_df.to_parquet(ENTITIES_FOLDER, engine="pyarrow", compression="snappy")

CPU times: user 169 ms, sys: 42 ms, total: 211 ms
Wall time: 3min 7s


## Verify Result

In [16]:
ENTITIES_FOLDER

's3://saturn-elsevierinc/incremental/add-ents-craft'

In [21]:
fs.du(ENTITIES_FOLDER) / 1e6

1.811986

In [22]:
entities_df = dd.read_parquet(ENTITIES_FOLDER, engine="pyarrow")
entities_df.head()

Unnamed: 0,cord_uid,pid,sid,eid,eclass,etext,elabel,escore,ent_start_char,ent_end_char
38563,kwby80nj,T,1,0,craft,Water,CHEBI,1.0,27,32
38584,7y9ewt2v,T,0,0,craft,viruses,TAXON,1.0,21,28
38622,3qupx9yp,T,0,0,craft,Stem Cell,CL,1.0,11,20
38622,3qupx9yp,T,0,1,craft,Vesicles/Exosomes,GGP,1.0,158,175
38726,oqf6kopn,A0,2,0,craft,mukosalen,TAXON,1.0,185,194


In [23]:
len(entities_df)

156962

In [24]:
# do this if youre done using the cluster
cluster.close()

distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
concurrent.futures._base.CancelledError
