# Incremental Post-processor

This notebook uses the `incremental/added` and `incremental/deleted` data generated by `05ax-incremental-pre` and applies it to each of the outputs of the full pipeline.

* Run the entire pipeline on the paragraphs from the `incremental/added` file. This is the `incremental/[entity_name]` file.
* Read the `[entity_name]` file from previous run.
* Apply the deletions from the `incremental/deleted` files to `[output_entity_name]` file.
* Concatenate the additions from `incremental/[entity_name]` files.
* Repartition and write output file `[output_entity_name]`.

## Initialize Dask Cluster

In [2]:
from dask_saturn.core import describe_sizes

describe_sizes()

{'medium': 'Medium - 2 cores - 4 GB RAM',
 'large': 'Large - 2 cores - 16 GB RAM',
 'xlarge': 'XLarge - 4 cores - 32 GB RAM',
 '2xlarge': '2XLarge - 8 cores - 64 GB RAM',
 '4xlarge': '4XLarge - 16 cores - 128 GB RAM',
 '8xlarge': '8XLarge - 32 cores - 256 GB RAM',
 '12xlarge': '12XLarge - 48 cores - 384 GB RAM',
 '16xlarge': '16XLarge - 64 cores - 512 GB RAM',
 'g4dnxlarge': 'T4-XLarge - 4 cores - 16 GB RAM - 1 GPU',
 'g4dn4xlarge': 'T4-4XLarge - 16 cores - 64 GB RAM - 1 GPU',
 'g4dn8xlarge': 'T4-8XLarge - 32 cores - 128 GB RAM - 1 GPU',
 'p32xlarge': 'V100-2XLarge - 8 cores - 61 GB RAM - 1 GPU',
 'p38xlarge': 'V100-8XLarge - 32 cores - 244 GB RAM - 4 GPU',
 'p316xlarge': 'V100-16XLarge - 64 cores - 488 GB RAM - 8 GPU'}

In [3]:
from dask.distributed import Client, wait
from dask_saturn import SaturnCluster
import time

n_workers = 10
cluster = SaturnCluster(n_workers=n_workers,
                        scheduler_size="2xlarge",
                        worker_size="4xlarge",
                        nthreads=16)
client = Client(cluster)
cluster

[2020-10-02 17:22:50] INFO - dask-saturn | Starting cluster. Status: pending
[2020-10-02 17:22:57] INFO - dask-saturn | Starting cluster. Status: pending
[2020-10-02 17:23:09] INFO - dask-saturn | Starting cluster. Status: pending
[2020-10-02 17:23:41] INFO - dask-saturn | Starting cluster. Status: pending
[2020-10-02 17:24:41] INFO - dask-saturn | Starting cluster. Status: pending
[2020-10-02 17:25:38] INFO - dask-saturn | Starting cluster. Status: pending
[2020-10-02 17:26:26] INFO - dask-saturn | Starting cluster. Status: pending
[2020-10-02 17:27:21] INFO - dask-saturn | Starting cluster. Status: pending
[2020-10-02 17:28:02] INFO - dask-saturn | Cluster is ready


VBox(children=(HTML(value='<h2>SaturnCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n   …

In [4]:
while len(client.scheduler_info()["workers"]) < n_workers:
    print("Waiting for workers, got", len(client.scheduler_info()["workers"]))
    time.sleep(30)
print("Done!")

Waiting for workers, got 0
Waiting for workers, got 0
Waiting for workers, got 0
Waiting for workers, got 0
Waiting for workers, got 0
Waiting for workers, got 0
Waiting for workers, got 0
Waiting for workers, got 0
Waiting for workers, got 0
Done!


## Setup

In [5]:
import boto3
import dask.dataframe as dd
import json
import numpy as np
import os
import pandas as pd
import s3fs

from dask.distributed import Client, progress, get_worker

In [6]:
BUCKET_NAME = "s3://saturn-elsevierinc"

D1 = "2020-08-28"
D2 = "2020-09-28"

DIFF_ADD_FOLDER = "/".join([BUCKET_NAME, "incremental", "added"])
DIFF_DEL_FOLDER = "/".join([BUCKET_NAME, "incremental", "deleted"])

PARA_FULL_FOLDER = "/".join([BUCKET_NAME, "cord19-paras-pq"])
PARA_INC_FOLDER = "/".join([BUCKET_NAME, "incremental", "added"])
PARA_MERGED_FOLDER = "/".join([BUCKET_NAME, D2, "cord19-paras-pq"])

SENT_FULL_FOLDER = "/".join([BUCKET_NAME, "cord19-sents-pqr"])
SENT_INC_FOLDER = "/".join([BUCKET_NAME, "incremental", "add-sents"])
SENT_MERGED_FOLDER = "/".join([BUCKET_NAME, D2, "cord19-sents-pq"])

MODEL_NAMES = [
    "craft", "jnlpba", "bc5cdr", "bionlp",
    "umls", "mesh", "go", "hpo", "rxnorm"
]
ENT_FULL_FOLDER_T = "/".join([BUCKET_NAME, "cord19-ents-{:s}-pq"])
ENT_INC_FOLDER_T = "/".join([BUCKET_NAME, "incremental", "add-ents-{:s}"])
ENT_MERGED_FOLDER_T = "/".join([BUCKET_NAME, D2, "cord19-ents-{:s}-pq"])

In [7]:
fs = s3fs.S3FileSystem()

In [19]:
def delete_diff_rows(full_df, del_df):
    # remove rows listed in del_df from para_full_df
    result_df = full_df.merge(del_df, how="outer", indicator=True,
                              on=["cord_uid"], suffixes=["_d1", "_d2"])
    result_df = result_df[result_df._merge == "left_only"]

    # drop extra cols acquired from del_df, rename para_full_df cols
    drop_cols = [x for x in result_df.columns if x.endswith("_d2")]
    rename_cols = {x : x[0:-3] for x in result_df.columns if x.endswith("_d1")}
    if len(drop_cols) > 0:
        result_df = result_df.drop(columns=drop_cols, axis=1)
    if len(rename_cols) > 0:
        result_df = result_df.rename(columns=rename_cols)

    return result_df


def add_diff_rows(full_df, inc_df):
    result_df = dd.concat([full_df, inc_df], axis=0, interleave_partitions=True)
    return result_df


def set_entity_dtypes(entities_df):
    entities_df.cord_uid = entities_df.cord_uid.astype(str)
    entities_df.pid = entities_df.pid.astype(str)
    entities_df.sid = entities_df.sid.astype(np.int32)
    entities_df.eid = entities_df.eid.astype(np.int32)
    entities_df.eclass = entities_df.eclass.astype(str)
    entities_df.etext = entities_df.etext.astype(str)
    entities_df.elabel = entities_df.elabel.astype(str)
    entities_df.escore = entities_df.escore.astype(np.float32)
    entities_df.ent_start_char = entities_df.ent_start_char.astype(np.int32)
    entities_df.ent_end_char = entities_df.ent_end_char.astype(np.int32)
    return entities_df

## Addition and Deletion Lists

In [9]:
del_df = dd.read_parquet(DIFF_DEL_FOLDER, engine="pyarrow")
add_df = dd.read_parquet(DIFF_ADD_FOLDER, engine="pyarrow")

## Paragraphs

In [10]:
print("Input: {:s}".format(PARA_FULL_FOLDER))
print("Incremental: {:s}".format(PARA_INC_FOLDER))
print("Merged: {:s}".format(PARA_MERGED_FOLDER))

Input: s3://saturn-elsevierinc/cord19-paras-pq
Incremental: s3://saturn-elsevierinc/incremental/added
Merged: s3://saturn-elsevierinc/2020-09-28/cord19-paras-pq


In [11]:
para_full_df = dd.read_parquet(PARA_FULL_FOLDER, engine="pyarrow")
para_inc_df = dd.read_parquet(PARA_INC_FOLDER, engine="pyarrow")

In [20]:
para_df = delete_diff_rows(para_full_df, del_df)
para_df = add_diff_rows(para_df, para_inc_df)

para_df.cord_uid = para_df.cord_uid.astype(str)
para_df.pid = para_df.pid.astype(str)
para_df.ptext = para_df.ptext.astype(str)

In [21]:
if fs.exists(PARA_MERGED_FOLDER):
    fs.rm(PARA_MERGED_FOLDER, recursive=True)

In [22]:
%%time
para_df = para_df.repartition(partition_size="20MB")

CPU times: user 421 ms, sys: 4.29 ms, total: 425 ms
Wall time: 7.41 s


In [23]:
%%time
para_df.to_parquet(PARA_MERGED_FOLDER, engine="pyarrow", compression="snappy")

CPU times: user 575 ms, sys: 24 ms, total: 599 ms
Wall time: 9.47 s


In [28]:
fs.du(PARA_MERGED_FOLDER) / 1e6

0.0

In [29]:
para_df = dd.read_parquet(PARA_MERGED_FOLDER, engine="pyarrow")
para_df.head()

OSError: Passed non-file path: saturn-elsevierinc/2020-09-28/cord19-paras-pq

In [27]:
len(para_df)

3442413

## Sentences

In [30]:
print("Input: {:s}".format(SENT_FULL_FOLDER))
print("Incremental: {:s}".format(SENT_INC_FOLDER))
print("Merged: {:s}".format(SENT_MERGED_FOLDER))

Input: s3://saturn-elsevierinc/cord19-sents-pqr
Incremental: s3://saturn-elsevierinc/incremental/add-sents
Merged: s3://saturn-elsevierinc/2020-09-28/cord19-sents-pq


In [31]:
sent_full_df = dd.read_parquet(SENT_FULL_FOLDER, engine="pyarrow")
sent_inc_df = dd.read_parquet(SENT_INC_FOLDER, engine="pyarrow")

In [32]:
sent_df = delete_diff_rows(sent_full_df, del_df)
sent_df = add_diff_rows(sent_df, sent_inc_df)

sent_df.cord_uid = sent_df.cord_uid.astype(str)
sent_df.pid = sent_df.pid.astype(str)
sent_df.sid = sent_df.sid.astype(np.int32)
sent_df.stext = sent_df.stext.astype(str)

In [33]:
if fs.exists(SENT_MERGED_FOLDER):
    fs.rm(SENT_MERGED_FOLDER, recursive=True)

In [34]:
%%time
sent_df = sent_df.repartition(partition_size="20MB")

CPU times: user 1.81 s, sys: 80 ms, total: 1.89 s
Wall time: 18.2 s


In [35]:
%%time
sent_df.to_parquet(SENT_MERGED_FOLDER, engine="pyarrow", compression="snappy")

CPU times: user 2.27 s, sys: 73 ms, total: 2.34 s
Wall time: 21.9 s


In [36]:
fs.du(SENT_MERGED_FOLDER) / 1e6

1539.924589

In [37]:
sent_df = dd.read_parquet(SENT_MERGED_FOLDER, engine="pyarrow")
sent_df.head()

Unnamed: 0,cord_uid,pid,sid,stext,title,abstract,pdf_json_files,pmc_json_files,_merge
0,zzkkm496,T,0,Outcome of paediatric intensive care survivors,,,,,left_only
1,zzkkm496,A0,0,The development of paediatric intensive care h...,,,,,left_only
2,zzkkm496,A0,1,Physical and psychological sequelae and conseq...,,,,,left_only
3,zzkkm496,A0,2,Awareness of sequelae due to the original illn...,,,,,left_only
4,zzkkm496,A0,3,To determine the current knowledge on physical...,,,,,left_only


In [38]:
len(sent_df)

17162879

## Entities (0): CRAFT

In [39]:
i = 0
ent_full_folder = ENT_FULL_FOLDER_T.format(MODEL_NAMES[i])
ent_inc_folder = ENT_INC_FOLDER_T.format(MODEL_NAMES[i])
ent_merged_folder = ENT_MERGED_FOLDER_T.format(MODEL_NAMES[i])

print("Input: {:s}".format(ent_full_folder))
print("Incremental: {:s}".format(ent_inc_folder))
print("Merged: {:s}".format(ent_merged_folder))

Input: s3://saturn-elsevierinc/cord19-ents-craft-pq
Incremental: s3://saturn-elsevierinc/incremental/add-ents-craft
Merged: s3://saturn-elsevierinc/2020-09-28/cord19-ents-craft-pq


In [40]:
ent_full_df = dd.read_parquet(ent_full_folder, engine="pyarrow")
ent_inc_df = dd.read_parquet(ent_inc_folder, engine="pyarrow")

In [41]:
ent_df = delete_diff_rows(ent_full_df, del_df)
ent_df = add_diff_rows(ent_df, ent_inc_df)
ent_df = set_entity_dtypes(ent_df)

In [42]:
if fs.exists(ent_merged_folder):
    fs.rm(ent_merged_folder, recursive=True)

In [43]:
%%time
ent_df = ent_df.repartition(partition_size="20MB")

CPU times: user 420 ms, sys: 24.2 ms, total: 444 ms
Wall time: 11.7 s


In [44]:
%%time
ent_df.to_parquet(ent_merged_folder, engine="pyarrow", compression="snappy")

CPU times: user 676 ms, sys: 16.4 ms, total: 692 ms
Wall time: 13.5 s


In [45]:
fs.du(ent_merged_folder) / 1e6

238.699419

In [46]:
ent_df = dd.read_parquet(ent_merged_folder, engine="pyarrow")
ent_df.head()

Unnamed: 0,cord_uid,pid,sid,eid,eclass,etext,elabel,escore,ent_start_char,ent_end_char,title,abstract,pdf_json_files,pmc_json_files,_merge
0,22veehj5,B0,0,0,craft,Human,TAXON,1.0,0,5,,,,,left_only
1,22veehj5,B0,0,1,craft,malarial,TAXON,1.0,56,64,,,,,left_only
2,22veehj5,B0,0,2,craft,parasites,CL,1.0,78,87,,,,,left_only
3,22veehj5,B0,0,3,craft,P. malariae,CL,1.0,107,118,,,,,left_only
4,22veehj5,B0,0,4,craft,P.,CL,1.0,133,135,,,,,left_only


In [47]:
len(ent_df)

16761104

## Entities (1): JNLPBA

In [48]:
i = 1
ent_full_folder = ENT_FULL_FOLDER_T.format(MODEL_NAMES[i])
ent_inc_folder = ENT_INC_FOLDER_T.format(MODEL_NAMES[i])
ent_merged_folder = ENT_MERGED_FOLDER_T.format(MODEL_NAMES[i])

print("Input: {:s}".format(ent_full_folder))
print("Incremental: {:s}".format(ent_inc_folder))
print("Merged: {:s}".format(ent_merged_folder))

Input: s3://saturn-elsevierinc/cord19-ents-jnlpba-pq
Incremental: s3://saturn-elsevierinc/incremental/add-ents-jnlpba
Merged: s3://saturn-elsevierinc/2020-09-28/cord19-ents-jnlpba-pq


In [49]:
ent_full_df = dd.read_parquet(ent_full_folder, engine="pyarrow")
ent_inc_df = dd.read_parquet(ent_inc_folder, engine="pyarrow")

In [50]:
ent_df = delete_diff_rows(ent_full_df, del_df)
ent_df = add_diff_rows(ent_df, ent_inc_df)
ent_df = set_entity_dtypes(ent_df)

In [51]:
if fs.exists(ent_merged_folder):
    fs.rm(ent_merged_folder, recursive=True)

In [52]:
%%time
ent_df = ent_df.repartition(partition_size="20MB")

CPU times: user 380 ms, sys: 12.6 ms, total: 392 ms
Wall time: 9.55 s


In [53]:
%%time
ent_df.to_parquet(ent_merged_folder, engine="pyarrow", compression="snappy")

CPU times: user 562 ms, sys: 27.7 ms, total: 590 ms
Wall time: 10.7 s


In [54]:
fs.du(ent_merged_folder) / 1e6

192.871806

In [55]:
ent_df = dd.read_parquet(ent_merged_folder, engine="pyarrow")
ent_df.head()

Unnamed: 0,cord_uid,pid,sid,eid,eclass,etext,elabel,escore,ent_start_char,ent_end_char,title,abstract,pdf_json_files,pmc_json_files,_merge
0,22veehj5,B0,3,0,jnlpba,invade hepatocytes,CELL_TYPE,1.0,151,169,,,,,left_only
1,22veehj5,B0,3,1,jnlpba,haploid merozoites,CELL_TYPE,1.0,278,296,,,,,left_only
2,22veehj5,B0,4,0,jnlpba,red blood cells,CELL_TYPE,1.0,26,41,,,,,left_only
3,22veehj5,B0,4,1,jnlpba,RBCs,CELL_TYPE,1.0,43,47,,,,,left_only
4,22veehj5,B0,6,0,jnlpba,lysed infected RBC,CELL_TYPE,1.0,29,47,,,,,left_only


In [56]:
len(ent_df)

10808372

## Entities (2): BC5CDR

In [57]:
i = 2
ent_full_folder = ENT_FULL_FOLDER_T.format(MODEL_NAMES[i])
ent_inc_folder = ENT_INC_FOLDER_T.format(MODEL_NAMES[i])
ent_merged_folder = ENT_MERGED_FOLDER_T.format(MODEL_NAMES[i])

print("Input: {:s}".format(ent_full_folder))
print("Incremental: {:s}".format(ent_inc_folder))
print("Merged: {:s}".format(ent_merged_folder))

Input: s3://saturn-elsevierinc/cord19-ents-bc5cdr-pq
Incremental: s3://saturn-elsevierinc/incremental/add-ents-bc5cdr
Merged: s3://saturn-elsevierinc/2020-09-28/cord19-ents-bc5cdr-pq


In [58]:
ent_full_df = dd.read_parquet(ent_full_folder, engine="pyarrow")
ent_inc_df = dd.read_parquet(ent_inc_folder, engine="pyarrow")

In [59]:
ent_df = delete_diff_rows(ent_full_df, del_df)
ent_df = add_diff_rows(ent_df, ent_inc_df)
ent_df = set_entity_dtypes(ent_df)

In [60]:
if fs.exists(ent_merged_folder):
    fs.rm(ent_merged_folder, recursive=True)

In [61]:
%%time
ent_df = ent_df.repartition(partition_size="20MB")

CPU times: user 419 ms, sys: 16.8 ms, total: 436 ms
Wall time: 9.67 s


In [62]:
%%time
ent_df.to_parquet(ent_merged_folder, engine="pyarrow", compression="snappy")

CPU times: user 554 ms, sys: 31.7 ms, total: 586 ms
Wall time: 10.9 s


In [63]:
fs.du(ent_merged_folder) / 1e6

183.310804

In [64]:
ent_df = dd.read_parquet(ent_merged_folder, engine="pyarrow")
ent_df.head()

Unnamed: 0,cord_uid,pid,sid,eid,eclass,etext,elabel,escore,ent_start_char,ent_end_char,title,abstract,pdf_json_files,pmc_json_files,_merge
0,22veehj5,B0,0,0,bc5cdr,infection,DISEASE,1.0,29,38,,,,,left_only
1,22veehj5,B0,0,1,bc5cdr,Plasmodium,DISEASE,1.0,66,76,,,,,left_only
2,22veehj5,B0,0,2,bc5cdr,Plasmodium ovale,DISEASE,1.0,89,105,,,,,left_only
3,22veehj5,B0,0,3,bc5cdr,P. malariae,DISEASE,1.0,107,118,,,,,left_only
4,22veehj5,B0,0,4,bc5cdr,P. knowlesi,DISEASE,1.0,120,131,,,,,left_only


In [65]:
len(ent_df)

11494481

## Entities (3): BioNLP

In [66]:
i = 3
ent_full_folder = ENT_FULL_FOLDER_T.format(MODEL_NAMES[i])
ent_inc_folder = ENT_INC_FOLDER_T.format(MODEL_NAMES[i])
ent_merged_folder = ENT_MERGED_FOLDER_T.format(MODEL_NAMES[i])

print("Input: {:s}".format(ent_full_folder))
print("Incremental: {:s}".format(ent_inc_folder))
print("Merged: {:s}".format(ent_merged_folder))

Input: s3://saturn-elsevierinc/cord19-ents-bionlp-pq
Incremental: s3://saturn-elsevierinc/incremental/add-ents-bionlp
Merged: s3://saturn-elsevierinc/2020-09-28/cord19-ents-bionlp-pq


In [67]:
ent_full_df = dd.read_parquet(ent_full_folder, engine="pyarrow")
ent_inc_df = dd.read_parquet(ent_inc_folder, engine="pyarrow")

In [68]:
ent_df = delete_diff_rows(ent_full_df, del_df)
ent_df = add_diff_rows(ent_df, ent_inc_df)
ent_df = set_entity_dtypes(ent_df)

In [69]:
if fs.exists(ent_merged_folder):
    fs.rm(ent_merged_folder, recursive=True)

In [70]:
%%time
ent_df = ent_df.repartition(partition_size="20MB")

CPU times: user 426 ms, sys: 15.7 ms, total: 441 ms
Wall time: 18 s


In [71]:
%%time
ent_df.to_parquet(ent_merged_folder, engine="pyarrow", compression="snappy")

CPU times: user 829 ms, sys: 35.8 ms, total: 865 ms
Wall time: 20 s


In [72]:
fs.du(ent_merged_folder) / 1e6

403.808422

In [73]:
ent_df = dd.read_parquet(ent_merged_folder, engine="pyarrow")
ent_df.head()

Unnamed: 0,cord_uid,pid,sid,eid,eclass,etext,elabel,escore,ent_start_char,ent_end_char,title,abstract,pdf_json_files,pmc_json_files,_merge
0,22veehj5,B0,0,0,bionlp,Human malarias,ORGANISM,1.0,0,14,,,,,left_only
1,22veehj5,B0,0,1,bionlp,Plasmodium) parasites,ORGANISM,1.0,66,87,,,,,left_only
2,22veehj5,B0,0,2,bionlp,Plasmodium ovale,SIMPLE_CHEMICAL,1.0,89,105,,,,,left_only
3,22veehj5,B0,0,3,bionlp,P.,SIMPLE_CHEMICAL,1.0,107,109,,,,,left_only
4,22veehj5,B0,0,4,bionlp,P. knowlesi,ORGANISM,1.0,120,131,,,,,left_only


In [74]:
len(ent_df)

25492781

## Entities (4): UMLS

In [75]:
i = 4
ent_full_folder = ENT_FULL_FOLDER_T.format(MODEL_NAMES[i])
ent_inc_folder = ENT_INC_FOLDER_T.format(MODEL_NAMES[i])
ent_merged_folder = ENT_MERGED_FOLDER_T.format(MODEL_NAMES[i])

print("Input: {:s}".format(ent_full_folder))
print("Incremental: {:s}".format(ent_inc_folder))
print("Merged: {:s}".format(ent_merged_folder))

Input: s3://saturn-elsevierinc/cord19-ents-umls-pq
Incremental: s3://saturn-elsevierinc/incremental/add-ents-umls
Merged: s3://saturn-elsevierinc/2020-09-28/cord19-ents-umls-pq


In [76]:
ent_full_df = dd.read_parquet(ent_full_folder, engine="pyarrow")
ent_inc_df = dd.read_parquet(ent_inc_folder, engine="pyarrow")

In [77]:
ent_df = delete_diff_rows(ent_full_df, del_df)
ent_df = add_diff_rows(ent_df, ent_inc_df)
ent_df = set_entity_dtypes(ent_df)

In [78]:
if fs.exists(ent_merged_folder):
    fs.rm(ent_merged_folder, recursive=True)

In [79]:
%%time
ent_df = ent_df.repartition(partition_size="20MB")

CPU times: user 2.18 s, sys: 65.3 ms, total: 2.24 s
Wall time: 6min 10s


In [80]:
%%time
ent_df.to_parquet(ent_merged_folder, engine="pyarrow", compression="snappy")

CPU times: user 10.6 s, sys: 488 ms, total: 11 s
Wall time: 8min 22s


In [81]:
fs.du(ent_merged_folder) / 1e6

6915.902744

In [82]:
ent_df = dd.read_parquet(ent_merged_folder, engine="pyarrow")
ent_df.head()

Unnamed: 0,cord_uid,pid,sid,eid,eclass,etext,elabel,escore,ent_start_char,ent_end_char,title,abstract,pdf_json_files,pmc_json_files,_merge
0,zzkkm496,T,0,0,umls,Outcome,C1274040,1.0,0,7,,,,,left_only
1,zzkkm496,T,0,0,umls,Outcome,C2735114,0.910458,0,7,,,,,left_only
2,zzkkm496,T,0,0,umls,Outcome,C2735113,0.910458,0,7,,,,,left_only
3,zzkkm496,T,0,0,umls,Outcome,C1550525,0.879695,0,7,,,,,left_only
4,zzkkm496,T,0,0,umls,Outcome,C1272708,0.878289,0,7,,,,,left_only


In [83]:
len(ent_df)

451351602

## Entities (5): MeSH

In [84]:
i = 5
ent_full_folder = ENT_FULL_FOLDER_T.format(MODEL_NAMES[i])
ent_inc_folder = ENT_INC_FOLDER_T.format(MODEL_NAMES[i])
ent_merged_folder = ENT_MERGED_FOLDER_T.format(MODEL_NAMES[i])

print("Input: {:s}".format(ent_full_folder))
print("Incremental: {:s}".format(ent_inc_folder))
print("Merged: {:s}".format(ent_merged_folder))

Input: s3://saturn-elsevierinc/cord19-ents-mesh-pq
Incremental: s3://saturn-elsevierinc/incremental/add-ents-mesh
Merged: s3://saturn-elsevierinc/2020-09-28/cord19-ents-mesh-pq


In [85]:
ent_full_df = dd.read_parquet(ent_full_folder, engine="pyarrow")
ent_inc_df = dd.read_parquet(ent_inc_folder, engine="pyarrow")

In [86]:
ent_df = delete_diff_rows(ent_full_df, del_df)
ent_df = add_diff_rows(ent_df, ent_inc_df)
ent_df = set_entity_dtypes(ent_df)

In [87]:
if fs.exists(ent_merged_folder):
    fs.rm(ent_merged_folder, recursive=True)

In [88]:
%%time
ent_df = ent_df.repartition(partition_size="20MB")

CPU times: user 471 ms, sys: 14.6 ms, total: 486 ms
Wall time: 2min 23s


In [89]:
%%time
ent_df.to_parquet(ent_merged_folder, engine="pyarrow", compression="snappy")

CPU times: user 3.82 s, sys: 154 ms, total: 3.98 s
Wall time: 3min 38s


In [90]:
fs.du(ent_merged_folder) / 1e6

2968.218539

In [91]:
ent_df = dd.read_parquet(ent_merged_folder, engine="pyarrow")
ent_df.head()

Unnamed: 0,cord_uid,pid,sid,eid,eclass,etext,elabel,escore,ent_start_char,ent_end_char,title,abstract,pdf_json_files,pmc_json_files,_merge
0,22veehj5,B0,0,1,mesh,infection,D012192,1.0,29,38,,,,,left_only
1,22veehj5,B0,0,1,mesh,infection,D007239,1.0,29,38,,,,,left_only
2,22veehj5,B0,0,1,mesh,infection,D010031,0.797104,29,38,,,,,left_only
3,22veehj5,B0,0,1,mesh,infection,D001447,0.769331,29,38,,,,,left_only
4,22veehj5,B0,0,1,mesh,infection,D006192,0.759506,29,38,,,,,left_only


In [92]:
len(ent_df)

185228588

## Entities (6): GO

In [93]:
i = 6
ent_full_folder = ENT_FULL_FOLDER_T.format(MODEL_NAMES[i])
ent_inc_folder = ENT_INC_FOLDER_T.format(MODEL_NAMES[i])
ent_merged_folder = ENT_MERGED_FOLDER_T.format(MODEL_NAMES[i])

print("Input: {:s}".format(ent_full_folder))
print("Incremental: {:s}".format(ent_inc_folder))
print("Merged: {:s}".format(ent_merged_folder))

Input: s3://saturn-elsevierinc/cord19-ents-go-pq
Incremental: s3://saturn-elsevierinc/incremental/add-ents-go
Merged: s3://saturn-elsevierinc/2020-09-28/cord19-ents-go-pq


In [94]:
ent_full_df = dd.read_parquet(ent_full_folder, engine="pyarrow")
ent_inc_df = dd.read_parquet(ent_inc_folder, engine="pyarrow")

In [95]:
ent_df = delete_diff_rows(ent_full_df, del_df)
ent_df = add_diff_rows(ent_df, ent_inc_df)
ent_df = set_entity_dtypes(ent_df)

In [96]:
if fs.exists(ent_merged_folder):
    fs.rm(ent_merged_folder, recursive=True)

In [97]:
%%time
ent_df = ent_df.repartition(partition_size="20MB")

CPU times: user 385 ms, sys: 27.1 ms, total: 412 ms
Wall time: 38.4 s


In [98]:
%%time
ent_df.to_parquet(ent_merged_folder, engine="pyarrow", compression="snappy")

CPU times: user 1.43 s, sys: 55.5 ms, total: 1.49 s
Wall time: 48.9 s


In [99]:
fs.du(ent_merged_folder) / 1e6

901.414234

In [100]:
ent_df = dd.read_parquet(ent_merged_folder, engine="pyarrow")
ent_df.head()

Unnamed: 0,cord_uid,pid,sid,eid,eclass,etext,elabel,escore,ent_start_char,ent_end_char,title,abstract,pdf_json_files,pmc_json_files,_merge
0,22veehj5,B0,0,1,go,infection,C0042769,0.806184,29,38,,,,,left_only
1,22veehj5,B0,0,10,go,female,C2610196,0.813911,184,190,,,,,left_only
2,22veehj5,B0,0,10,go,female,C1155790,0.783489,184,190,,,,,left_only
3,22veehj5,B0,0,10,go,female,C1154363,0.759302,184,190,,,,,left_only
4,22veehj5,B0,0,10,go,female,C1155791,0.746576,184,190,,,,,left_only


In [101]:
len(ent_df)

58868804

## Entities (7): HPO

In [102]:
i = 7
ent_full_folder = ENT_FULL_FOLDER_T.format(MODEL_NAMES[i])
ent_inc_folder = ENT_INC_FOLDER_T.format(MODEL_NAMES[i])
ent_merged_folder = ENT_MERGED_FOLDER_T.format(MODEL_NAMES[i])

print("Input: {:s}".format(ent_full_folder))
print("Incremental: {:s}".format(ent_inc_folder))
print("Merged: {:s}".format(ent_merged_folder))

Input: s3://saturn-elsevierinc/cord19-ents-hpo-pq
Incremental: s3://saturn-elsevierinc/incremental/add-ents-hpo
Merged: s3://saturn-elsevierinc/2020-09-28/cord19-ents-hpo-pq


In [103]:
ent_full_df = dd.read_parquet(ent_full_folder, engine="pyarrow")
ent_inc_df = dd.read_parquet(ent_inc_folder, engine="pyarrow")

In [104]:
ent_df = delete_diff_rows(ent_full_df, del_df)
ent_df = add_diff_rows(ent_df, ent_inc_df)
ent_df = set_entity_dtypes(ent_df)

In [105]:
if fs.exists(ent_merged_folder):
    fs.rm(ent_merged_folder, recursive=True)

In [106]:
%%time
ent_df = ent_df.repartition(partition_size="20MB")

CPU times: user 2.09 s, sys: 48.6 ms, total: 2.14 s
Wall time: 33.7 s


In [107]:
%%time
ent_df.to_parquet(ent_merged_folder, engine="pyarrow", compression="snappy")

CPU times: user 2.85 s, sys: 56.2 ms, total: 2.9 s
Wall time: 37.6 s


In [108]:
fs.du(ent_merged_folder) / 1e6

680.420858

In [109]:
ent_df = dd.read_parquet(ent_merged_folder, engine="pyarrow")
ent_df.head()

Unnamed: 0,cord_uid,pid,sid,eid,eclass,etext,elabel,escore,ent_start_char,ent_end_char,title,abstract,pdf_json_files,pmc_json_files,_merge
0,zzkkm496,A0,0,0,hpo,development,C0557874,0.755285,4,15,,,,,left_only
1,zzkkm496,A0,0,0,hpo,development,C4020789,0.740521,4,15,,,,,left_only
2,zzkkm496,A0,0,0,hpo,development,C1848980,0.715946,4,15,,,,,left_only
3,zzkkm496,A0,0,0,hpo,development,C0239174,0.709348,4,15,,,,,left_only
4,zzkkm496,A0,0,3,hpo,improved,C0332161,0.884068,68,76,,,,,left_only


In [110]:
len(ent_df)

45073224

## Entities (8): RxNorm

In [111]:
i = 8
ent_full_folder = ENT_FULL_FOLDER_T.format(MODEL_NAMES[i])
ent_inc_folder = ENT_INC_FOLDER_T.format(MODEL_NAMES[i])
ent_merged_folder = ENT_MERGED_FOLDER_T.format(MODEL_NAMES[i])

print("Input: {:s}".format(ent_full_folder))
print("Incremental: {:s}".format(ent_inc_folder))
print("Merged: {:s}".format(ent_merged_folder))

Input: s3://saturn-elsevierinc/cord19-ents-rxnorm-pq
Incremental: s3://saturn-elsevierinc/incremental/add-ents-rxnorm
Merged: s3://saturn-elsevierinc/2020-09-28/cord19-ents-rxnorm-pq


In [112]:
ent_full_df = dd.read_parquet(ent_full_folder, engine="pyarrow")
ent_inc_df = dd.read_parquet(ent_inc_folder, engine="pyarrow")

In [113]:
ent_df = delete_diff_rows(ent_full_df, del_df)
ent_df = add_diff_rows(ent_df, ent_inc_df)
ent_df = set_entity_dtypes(ent_df)

In [114]:
if fs.exists(ent_merged_folder):
    fs.rm(ent_merged_folder, recursive=True)

In [115]:
%%time
ent_df = ent_df.repartition(partition_size="20MB")

CPU times: user 1.96 s, sys: 45.6 ms, total: 2.01 s
Wall time: 13.7 s


In [116]:
%%time
ent_df.to_parquet(ent_merged_folder, engine="pyarrow", compression="snappy")

CPU times: user 1.91 s, sys: 32.2 ms, total: 1.95 s
Wall time: 14.1 s


In [117]:
fs.du(ent_merged_folder) / 1e6

2.541795

In [118]:
ent_df = dd.read_parquet(ent_merged_folder, engine="pyarrow")
ent_df.head()

Unnamed: 0,cord_uid,pid,sid,eid,eclass,etext,elabel,escore,ent_start_char,ent_end_char,title,abstract,pdf_json_files,pmc_json_files,_merge
0,zzkkm496,B9,0,0,rxnorm,Excluded,C0332196,1.0,0,8,,,,,left_only
1,zzkkm496,B27,1,0,rxnorm,Heterogeneity,C0242960,1.0,0,13,,,,,left_only
2,zwp2ujli,B13,1,0,rxnorm,Colon cancer,C0007102,1.0,0,12,,,,,left_only
3,td2uk2wc,A0,0,0,rxnorm,Pulmonary fibrosis,C0034069,1.0,0,18,,,,,left_only
4,b0tlco4t,T,0,1,rxnorm,Severe,C0205082,1.0,12,18,,,,,left_only


In [119]:
len(ent_df)

203230

## Clean Up

In [120]:
# do this if you are done using the cluster
cluster.close()

distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
concurrent.futures._base.CancelledError
