# Entity Extraction from old-style SciSpacy NER Models

These models identify entity spans in an input sentence, but do not attempt to link them to an external taxonomy. Several model variations are available; they are listed as commented-out alternatives in the configuration cell below. Replace the `MODEL_NAME, MODEL_ALIAS` line in that cell and re-run the notebook to extract named entity information with the chosen model.

We can run this notebook with different values of `MODEL_NAME` and `MODEL_ALIAS` to create different entity dumps from each model.

In [1]:
import dask.dataframe as dd
import json
import numpy as np
import pandas as pd
import s3fs
import spacy
import scispacy

from dask.distributed import Client, progress, get_worker

In [2]:
# Select exactly ONE (spaCy model package, short alias) pair by leaving a single
# line uncommented. MODEL_NAME is what gets passed to spacy.load(); MODEL_ALIAS
# is stored as the entity class and is embedded in the output folder name, so
# each model writes to its own entity dump.
# MODEL_NAME, MODEL_ALIAS = "en_ner_craft_md", "craft"
# MODEL_NAME, MODEL_ALIAS = "en_ner_jnlpba_md", "jnlpba"
# MODEL_NAME, MODEL_ALIAS = "en_ner_bc5cdr_md", "bc5cdr"
MODEL_NAME, MODEL_ALIAS = "en_ner_bionlp13cg_md", "bionlp"

In [3]:
BUCKET_NAME = "saturn-elsevierinc"

# Input: parquet dataset of sentences. Output: parquet dataset of entities,
# one folder per model alias so dumps from different models never collide.
SENTENCE_FOLDER = "s3://{:s}/cord19-sents-pq-sm".format(BUCKET_NAME)
ENTITIES_FOLDER = "s3://{:s}/cord19-ents-{:s}-pq-sm".format(
    BUCKET_NAME, MODEL_ALIAS)

In [4]:
# Open the sentence parquet dataset as a dask dataframe.
sentences_df = dd.read_parquet(SENTENCE_FOLDER, engine="pyarrow")
# Preview the first rows; npartitions=10 allows head() to draw from the first
# 10 partitions instead of only the first one.
sentences_df.head(npartitions=10)

Unnamed: 0,cord_uid,pid,sid,stext
6498,sz7qmi8q,A0,0,Schwer punkt: Lun gen-und Pleura pa tho lo gie...
6498,sz7qmi8q,A0,1,für Pa tho lo gie der Ruhr-Uni ver si tät Bo c...
6498,sz7qmi8q,A0,2,fi ka ti on der Er kran kun gen des pul mo
6498,sz7qmi8q,A0,3,na len Sur fac tant-Sys tems TYP I TYP II TYP ...
6498,sz7qmi8q,A0,4,re nenund Säug lings al ter Neu ge bo


In [5]:
# Total number of sentence rows across all partitions.
len(sentences_df)

7313

## Processing

In [6]:
# Local development client: two single-threaded workers running inside this
# kernel process (processes=False).
client = Client(processes=False, n_workers=2, threads_per_worker=1)
client

# Alternative (disabled): connect to a 20-worker SaturnCluster instead of the
# local client above. Uncomment these lines and comment out the local client.
# from dask.distributed import Client
# from dask_saturn import SaturnCluster

# cluster = SaturnCluster(n_workers=20)
# client = Client(cluster)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 34611 instead
  http_address["port"], self.http_server.port


0,1
Client  Scheduler: inproc://10.0.4.79/173/1  Dashboard: http://10.0.4.79:34611/status,Cluster  Workers: 2  Cores: 2  Memory: 16.25 GB


In [7]:
def handle_batch(sents, nlp, ent_class):
    """Run the NER pipeline over a batch of sentences and collect entity tuples.

    Args:
        sents: sequence of sentence strings to annotate.
        nlp: loaded spaCy pipeline; only its ``pipe`` method is used.
        ent_class: value stored in the second field of every entity tuple
            (callers pass the model alias).

    Returns:
        A list with one element per input sentence, in input order. Each
        element is a list of tuples
        ``(eid, ent_class, text, label, score, start_char, end_char)`` where
        ``eid`` is the entity's position within its sentence and ``score`` is
        always the constant 1.0.
    """
    # Nothing to annotate; also avoids calling pipe() with batch_size=0.
    if len(sents) == 0:
        return []
    # The whole input is processed as a single spaCy batch. The former
    # n_threads argument is intentionally not passed: it has been deprecated
    # (with no effect) since spaCy 2.1 and later releases do not accept it.
    docs = nlp.pipe(sents, batch_size=len(sents))
    return [
        [
            (eid, ent_class, ent.text, ent.label_,
             1.0, ent.start_char, ent.end_char)
            for eid, ent in enumerate(doc.ents)
        ]
        for doc in docs
    ]


def handle_partition(part):
    """Extract entities for every sentence in one dataframe partition.

    Must run inside a dask worker: the spaCy model named by ``MODEL_NAME`` is
    loaded on first use and cached on the worker object as ``worker.nlp`` so
    later partitions handled by the same worker reuse it.

    Args:
        part: pandas DataFrame partition with an ``stext`` column holding the
            sentence text.

    Returns:
        A list with exactly one element per row of ``part``, in row order.
        Each element is the list of entity tuples produced by
        ``handle_batch`` for that row's sentence. Rows whose ``stext`` is not
        a string (e.g. missing values) get an empty list, so the result can
        always be assigned back as a column of the partition.
    """
    worker = get_worker()
    # Only a missing attribute means "not loaded yet"; any other failure
    # (e.g. the model package is not installed) should surface, not be hidden.
    nlp = getattr(worker, "nlp", None)
    if nlp is None:
        nlp = spacy.load(MODEL_NAME)
        worker.nlp = nlp

    batch_size = 32
    sentences = part["stext"].tolist()
    # Pre-fill one empty result per row so output stays aligned with the
    # partition even when some rows are skipped.
    entities = [[] for _ in sentences]
    valid_positions = [
        pos for pos, sent in enumerate(sentences) if isinstance(sent, str)
    ]
    for start in range(0, len(valid_positions), batch_size):
        positions = valid_positions[start:start + batch_size]
        batch_ents = handle_batch(
            [sentences[pos] for pos in positions], nlp, MODEL_ALIAS)
        for pos, ents in zip(positions, batch_ents):
            entities[pos] = ents
    return entities

In [8]:
# Start the entity pipeline from a separate dataframe object so the columns
# added and dropped below do not alter sentences_df. Re-run this cell before
# re-running the extraction cell, which consumes the stext column.
entities_df = sentences_df.copy()

In [9]:
# Run NER once per partition: every row receives the list of entity tuples
# found in its sentence.
entities_df["entities"] = entities_df.map_partitions(
    handle_partition, meta="object")

# The sentence text is no longer needed. explode() produces one row per entity
# tuple (an empty list becomes a missing value), and dropna() then removes
# every row that has a missing value in any column.
entities_df = (
    entities_df
    .drop(columns=["stext"])
    .explode("entities")
    .dropna()
)

# Unpack each field of the entity tuple into its own column. The order here
# must match the tuple layout built in handle_batch.
entity_fields = [
    ("eid", "int"),
    ("eclass", "str"),
    ("etext", "str"),
    ("elabel", "str"),
    ("escore", "float"),
    ("ent_start_char", "int"),
    ("ent_end_char", "int"),
]
for position, (column, dtype) in enumerate(entity_fields):
    # Bind position as a default argument so each lambda keeps its own index.
    entities_df[column] = entities_df.apply(
        lambda row, position=position: row.entities[position],
        meta=dtype, axis=1)

entities_df = entities_df.drop(columns=["entities"])

In [10]:
# Pin the dtype of every output column in a single cast so the schema written
# to parquet is explicit.
entities_df = entities_df.astype({
    "cord_uid": str,
    "pid": str,
    "sid": np.int32,
    "eid": np.int32,
    "eclass": str,
    "etext": str,
    "elabel": str,
    "escore": np.float32,
    "ent_start_char": np.int32,
    "ent_end_char": np.int32,
})

In [11]:
# Destructive: remove any previous entity dump for this model alias so the
# write below starts from an empty folder.
fs = s3fs.S3FileSystem()
if fs.exists(ENTITIES_FOLDER):
    fs.rm(ENTITIES_FOLDER, recursive=True)

In [12]:
# Writing the dataframe is what executes the extraction pipeline defined above.
entities_df.to_parquet(ENTITIES_FOLDER, engine="pyarrow", compression="snappy")

## Verify Result

In [13]:
# Re-read the dump from S3 (this rebinds entities_df to the persisted data
# rather than the in-memory pipeline) and preview the first rows.
entities_df = dd.read_parquet(ENTITIES_FOLDER, engine="pyarrow")
entities_df.head(npartitions=10)

Unnamed: 0,cord_uid,pid,sid,eid,eclass,etext,elabel,escore,ent_start_char,ent_end_char
6498,sz7qmi8q,A0,0,0,bionlp,F.,ORGAN,1.0,59,61
6498,sz7qmi8q,A0,0,1,bionlp,sti,GENE_OR_GENE_PRODUCT,1.0,72,75
6498,sz7qmi8q,A0,1,0,bionlp,Ruhr-Uni,GENE_OR_GENE_PRODUCT,1.0,22,30
6498,sz7qmi8q,A0,1,1,bionlp,Be rufs ge nos sen schaft li chen Kli ni ken,GENE_OR_GENE_PRODUCT,1.0,51,95
6498,sz7qmi8q,A0,1,2,bionlp,Klas,GENE_OR_GENE_PRODUCT,1.0,156,160


In [14]:
# Total number of entity rows in the persisted dump.
len(entities_df)

10701