# Incremental Post-processor

This notebook takes the `incremental/added` and `incremental/deleted` data generated by `05ax-incremental-pre` and applies them to each of the outputs of the full pipeline.

* Run the entire pipeline on the paragraphs from the `incremental/added` file. This produces the `incremental/[entity_name]` files.
* Read the `[entity_name]` file from the previous run.
* Remove the rows listed in the `incremental/deleted` files from the `[entity_name]` data.
* Concatenate the additions from the `incremental/[entity_name]` files.
* Repartition and write the output file `[output_entity_name]`.

## Initialize Dask Cluster

In [None]:
from dask_saturn.core import describe_sizes

describe_sizes()

In [None]:
from dask.distributed import Client, wait
from dask_saturn import SaturnCluster
import time

# Number of workers requested; the wait loop in the next cell polls until
# the scheduler reports this many.
n_workers = 10

cluster = SaturnCluster(
    n_workers=n_workers,
    scheduler_size='2xlarge',
    worker_size='4xlarge',
    nthreads=16,
)
client = Client(cluster)

# Last expression of the cell: display the cluster object.
cluster

In [None]:
# Poll the scheduler every 30 seconds until it reports at least `n_workers`
# workers, printing the current count on each unsuccessful poll.
while True:
    worker_count = len(client.scheduler_info()['workers'])
    if worker_count >= n_workers:
        break
    print('Waiting for workers, got', worker_count)
    time.sleep(30)
print('Done!')

## Setup

In [None]:
import boto3
import dask.dataframe as dd
import json
import numpy as np
import os
import pandas as pd
import s3fs

from dask.distributed import Client, progress, get_worker

In [None]:
# Root of the S3 bucket that holds every input and output of this notebook.
BUCKET_NAME = "s3://saturn-elsevierinc"

# Snapshot dates. D2 is the prefix under which the merged outputs are written;
# D1 is not referenced by any of the cells visible in this notebook.
D1 = "2020-08-28"
D2 = "2020-09-28"

# Diff produced by the incremental pre-processing notebook.
DIFF_ADD_FOLDER = "/".join([BUCKET_NAME, "incremental", "added"])
DIFF_DEL_FOLDER = "/".join([BUCKET_NAME, "incremental", "deleted"])

# Paragraphs: previous full run, incremental additions, merged output.
# PARA_INC_FOLDER is the same path as DIFF_ADD_FOLDER.
PARA_FULL_FOLDER = "/".join([BUCKET_NAME, "cord19-paras-pq"])
PARA_INC_FOLDER = "/".join([BUCKET_NAME, "incremental", "added"])
PARA_MERGED_FOLDER = "/".join([BUCKET_NAME, D2, "cord19-paras-pq"])

# Sentences: previous full run, incremental additions, merged output.
# NOTE(review): the full-run folder ends in "-pqr" while every other folder
# ends in "-pq" -- confirm this matches the actual S3 folder name.
SENT_FULL_FOLDER = "/".join([BUCKET_NAME, "cord19-sents-pqr"])
SENT_INC_FOLDER = "/".join([BUCKET_NAME, "incremental", "add-sents"])
SENT_MERGED_FOLDER = "/".join([BUCKET_NAME, D2, "cord19-sents-pq"])

# Entity models; each entity section below selects one by its index.
MODEL_NAMES = [
    "craft", "jnlpba", "bc5cdr", "bionlp",
    "umls", "mesh", "go", "hpo", "rxnorm"
]

# Entity folder templates; fill in the model name with `.format(name)`.
# (A stray ")" inside the first list used to make this whole cell a
# SyntaxError, leaving every constant above undefined.)
ENT_FULL_FOLDER_T = "/".join([BUCKET_NAME, "cord19-ents-{:s}-pq"])
ENT_INC_FOLDER_T = "/".join([BUCKET_NAME, "incremental", "add-ents-{:s}"])
ENT_MERGED_FOLDER_T = "/".join([BUCKET_NAME, D2, "cord19-ents-{:s}-pq"])

In [None]:
fs = s3fs.S3FileSystem()

In [None]:
# Load the diff as dask dataframes: `del_df` is passed to delete_diff_rows()
# in every section below. `add_df` reads the same folder as PARA_INC_FOLDER
# and is not referenced again in the visible cells.
del_df = dd.read_parquet(DIFF_DEL_FOLDER, engine="pyarrow")
add_df = dd.read_parquet(DIFF_ADD_FOLDER, engine="pyarrow")

## Paragraphs

In [None]:
# Show the three paragraph folders involved in this section.
print(
    "Input: {:s}".format(PARA_FULL_FOLDER),
    "Incremental: {:s}".format(PARA_INC_FOLDER),
    "Merged: {:s}".format(PARA_MERGED_FOLDER),
    sep="\n",
)

In [None]:
# Paragraphs from the previous full run, and the newly added paragraphs.
para_full_df = dd.read_parquet(PARA_FULL_FOLDER, engine="pyarrow")
para_inc_df = dd.read_parquet(PARA_INC_FOLDER, engine="pyarrow")

In [None]:
def delete_diff_rows(full_df, del_df, on="cord_uid"):
    """Return the rows of ``full_df`` whose key does not occur in ``del_df``.

    This is an anti-join: ``full_df`` is left-joined against the distinct
    key values of ``del_df`` and only the unmatched rows are kept.

    Parameters
    ----------
    full_df : DataFrame
        Frame to filter (pandas or dask).
    del_df : DataFrame
        Frame listing the keys to delete. Only its ``on`` column(s) are
        used; any other columns are ignored.
    on : str or list of str, default "cord_uid"
        Name(s) of the key column(s) present in both frames.

    Returns
    -------
    DataFrame
        The surviving rows, with exactly the columns of ``full_df`` in
        their original order.
    """
    keys = [on] if isinstance(on, str) else list(on)

    # Join against the key column(s) only. Merging the whole of del_df would
    # carry its non-key columns into the result: colliding names get a
    # suffix, but non-colliding ones survive as all-null columns.
    del_keys = del_df[keys].drop_duplicates()

    # A left join is enough for an anti-join; an outer join would also
    # produce right-only rows that are thrown away immediately.
    result_df = full_df.merge(del_keys, how="left", on=keys, indicator=True)
    result_df = result_df[result_df["_merge"] == "left_only"]

    # Selecting the original columns also drops the `_merge` indicator, which
    # would otherwise end up in the written output.
    return result_df[list(full_df.columns)]


def add_diff_rows(full_df, inc_df):
    """Stack ``inc_df`` underneath ``full_df`` and return the combined frame.

    Both frames are handed to ``dd.concat`` along the row axis with
    ``interleave_partitions=True``.
    """
    frames = [full_df, inc_df]
    return dd.concat(frames, axis=0, interleave_partitions=True)

In [None]:
# Drop the deleted documents from the previous run, then append the additions.
para_df = add_diff_rows(delete_diff_rows(para_full_df, del_df), para_inc_df)

# Cast every paragraph column to string in one pass.
para_df = para_df.astype({"cord_uid": str, "pid": str, "ptext": str})

In [None]:
# Destructive: recursively delete any previous merged paragraph output so
# the write in the next cell starts from an empty folder.
if fs.exists(PARA_MERGED_FOLDER):
    fs.rm(PARA_MERGED_FOLDER, recursive=True)

In [None]:
%%time
# Write the merged paragraphs as snappy-compressed parquet.
para_df.to_parquet(PARA_MERGED_FOLDER, engine="pyarrow", compression="snappy")

In [None]:
fs.du(PARA_MERGED_FOLDER) / 1e6

In [None]:
# Read the merged output back from S3 (rebinding `para_df`) and preview it.
para_df = dd.read_parquet(PARA_MERGED_FOLDER, engine="pyarrow")
para_df.head()

In [None]:
len(para_df)

## Sentences

In [None]:
# Show the three sentence folders involved in this section.
print(
    "Input: {:s}".format(SENT_FULL_FOLDER),
    "Incremental: {:s}".format(SENT_INC_FOLDER),
    "Merged: {:s}".format(SENT_MERGED_FOLDER),
    sep="\n",
)

In [None]:
# Sentences from the previous full run, and the sentences of the added paragraphs.
sent_full_df = dd.read_parquet(SENT_FULL_FOLDER, engine="pyarrow")
sent_inc_df = dd.read_parquet(SENT_INC_FOLDER, engine="pyarrow")

In [None]:
# Drop the deleted documents from the previous run, then append the additions.
sent_df = add_diff_rows(delete_diff_rows(sent_full_df, del_df), sent_inc_df)

# Cast all sentence columns in one pass: text columns to string,
# the sentence id to a 32-bit integer.
sent_df = sent_df.astype({
    "cord_uid": str,
    "pid": str,
    "sid": np.int32,
    "stext": str,
})

In [None]:
# Destructive: recursively delete any previous merged sentence output so
# the write in the next cell starts from an empty folder.
if fs.exists(SENT_MERGED_FOLDER):
    fs.rm(SENT_MERGED_FOLDER, recursive=True)

In [None]:
%%time
# Write the merged sentences as snappy-compressed parquet.
sent_df.to_parquet(SENT_MERGED_FOLDER, engine="pyarrow", compression="snappy")

In [None]:
fs.du(SENT_MERGED_FOLDER) / 1e6

In [None]:
# Read the merged output back from S3 (rebinding `sent_df`) and preview it.
sent_df = dd.read_parquet(SENT_MERGED_FOLDER, engine="pyarrow")
sent_df.head()

In [None]:
len(sent_df)

## Entities(0): CRAFT

In [1]:
def set_entity_dtypes(entities_df):
    """Cast the entity columns to their expected dtypes and return the frame.

    The columns are reassigned on ``entities_df`` itself, one at a time, so
    the frame passed in is modified and the same object is returned.
    """
    column_dtypes = (
        ("cord_uid", str),
        ("pid", str),
        ("sid", np.int32),
        ("eid", np.int32),
        ("eclass", str),
        ("etext", str),
        ("elabel", str),
        ("escore", np.float32),
        ("ent_start_char", np.int32),
        ("ent_end_char", np.int32),
    )
    for column, dtype in column_dtypes:
        # Attribute access mirrors `df.col = df.col.astype(...)` exactly.
        setattr(entities_df, column, getattr(entities_df, column).astype(dtype))
    return entities_df

In [None]:
# Select the entity model for this section and derive its three folders.
i = 0
ent_full_folder, ent_inc_folder, ent_merged_folder = (
    template.format(MODEL_NAMES[i])
    for template in (ENT_FULL_FOLDER_T, ENT_INC_FOLDER_T, ENT_MERGED_FOLDER_T)
)

print(
    "Input: {:s}".format(ent_full_folder),
    "Incremental: {:s}".format(ent_inc_folder),
    "Merged: {:s}".format(ent_merged_folder),
    sep="\n",
)

In [None]:
# Entities from the previous full run, and the entities of the added paragraphs,
# for the model selected in the cell above.
ent_full_df = dd.read_parquet(ent_full_folder, engine="pyarrow")
ent_inc_df = dd.read_parquet(ent_inc_folder, engine="pyarrow")

In [None]:
# Delete, append, then normalise dtypes -- the same three steps as the
# paragraph and sentence sections, composed into one expression.
ent_df = set_entity_dtypes(
    add_diff_rows(delete_diff_rows(ent_full_df, del_df), ent_inc_df)
)

In [None]:
# Destructive: recursively delete any previous merged entity output so
# the write in the next cell starts from an empty folder.
if fs.exists(ent_merged_folder):
    fs.rm(ent_merged_folder, recursive=True)

In [None]:
%%time
# Write the merged entities as snappy-compressed parquet.
ent_df.to_parquet(ent_merged_folder, engine="pyarrow", compression="snappy")

In [None]:
fs.du(ent_merged_folder) / 1e6

In [None]:
# Read the merged output back from S3 (rebinding `ent_df`) and preview it.
ent_df = dd.read_parquet(ent_merged_folder, engine="pyarrow")
ent_df.head()

In [None]:
len(ent_df)

## Entities(1): JNLPBA

In [None]:
# Select the entity model for this section and derive its three folders.
i = 1
ent_full_folder, ent_inc_folder, ent_merged_folder = (
    template.format(MODEL_NAMES[i])
    for template in (ENT_FULL_FOLDER_T, ENT_INC_FOLDER_T, ENT_MERGED_FOLDER_T)
)

print(
    "Input: {:s}".format(ent_full_folder),
    "Incremental: {:s}".format(ent_inc_folder),
    "Merged: {:s}".format(ent_merged_folder),
    sep="\n",
)

In [None]:
ent_full_df = dd.read_parquet(ent_full_folder, engine="pyarrow")
ent_inc_df = dd.read_parquet(ent_inc_folder, engine="pyarrow")

In [None]:
ent_df = delete_diff_rows(ent_full_df, del_df)
ent_df = add_diff_rows(ent_df, ent_inc_df)
ent_df = set_entity_dtypes(ent_df)

In [None]:
if fs.exists(ent_merged_folder):
    fs.rm(ent_merged_folder, recursive=True)

In [None]:
%%time
ent_df.to_parquet(ent_merged_folder, engine="pyarrow", compression="snappy")

In [None]:
fs.du(ent_merged_folder) / 1e6

In [None]:
ent_df = dd.read_parquet(ent_merged_folder, engine="pyarrow")
ent_df.head()

In [None]:
len(ent_df)

## Entities(2): BC5CDR

In [None]:
# Select the entity model for this section and derive its three folders.
i = 2
ent_full_folder, ent_inc_folder, ent_merged_folder = (
    template.format(MODEL_NAMES[i])
    for template in (ENT_FULL_FOLDER_T, ENT_INC_FOLDER_T, ENT_MERGED_FOLDER_T)
)

print(
    "Input: {:s}".format(ent_full_folder),
    "Incremental: {:s}".format(ent_inc_folder),
    "Merged: {:s}".format(ent_merged_folder),
    sep="\n",
)

In [None]:
ent_full_df = dd.read_parquet(ent_full_folder, engine="pyarrow")
ent_inc_df = dd.read_parquet(ent_inc_folder, engine="pyarrow")

In [None]:
ent_df = delete_diff_rows(ent_full_df, del_df)
ent_df = add_diff_rows(ent_df, ent_inc_df)
ent_df = set_entity_dtypes(ent_df)

In [None]:
if fs.exists(ent_merged_folder):
    fs.rm(ent_merged_folder, recursive=True)

In [None]:
%%time
ent_df.to_parquet(ent_merged_folder, engine="pyarrow", compression="snappy")

In [None]:
fs.du(ent_merged_folder) / 1e6

In [None]:
ent_df = dd.read_parquet(ent_merged_folder, engine="pyarrow")
ent_df.head()

In [None]:
len(ent_df)

## Entities(3): BioNLP

In [None]:
# Select the entity model for this section and derive its three folders.
i = 3
ent_full_folder, ent_inc_folder, ent_merged_folder = (
    template.format(MODEL_NAMES[i])
    for template in (ENT_FULL_FOLDER_T, ENT_INC_FOLDER_T, ENT_MERGED_FOLDER_T)
)

print(
    "Input: {:s}".format(ent_full_folder),
    "Incremental: {:s}".format(ent_inc_folder),
    "Merged: {:s}".format(ent_merged_folder),
    sep="\n",
)

In [None]:
ent_full_df = dd.read_parquet(ent_full_folder, engine="pyarrow")
ent_inc_df = dd.read_parquet(ent_inc_folder, engine="pyarrow")

In [None]:
ent_df = delete_diff_rows(ent_full_df, del_df)
ent_df = add_diff_rows(ent_df, ent_inc_df)
ent_df = set_entity_dtypes(ent_df)

In [None]:
if fs.exists(ent_merged_folder):
    fs.rm(ent_merged_folder, recursive=True)

In [None]:
%%time
ent_df.to_parquet(ent_merged_folder, engine="pyarrow", compression="snappy")

In [None]:
fs.du(ent_merged_folder) / 1e6

In [None]:
ent_df = dd.read_parquet(ent_merged_folder, engine="pyarrow")
ent_df.head()

In [None]:
len(ent_df)

## Entities(4): UMLS

In [None]:
# Select the entity model for this section and derive its three folders.
i = 4
ent_full_folder, ent_inc_folder, ent_merged_folder = (
    template.format(MODEL_NAMES[i])
    for template in (ENT_FULL_FOLDER_T, ENT_INC_FOLDER_T, ENT_MERGED_FOLDER_T)
)

print(
    "Input: {:s}".format(ent_full_folder),
    "Incremental: {:s}".format(ent_inc_folder),
    "Merged: {:s}".format(ent_merged_folder),
    sep="\n",
)

In [None]:
ent_full_df = dd.read_parquet(ent_full_folder, engine="pyarrow")
ent_inc_df = dd.read_parquet(ent_inc_folder, engine="pyarrow")

In [None]:
ent_df = delete_diff_rows(ent_full_df, del_df)
ent_df = add_diff_rows(ent_df, ent_inc_df)
ent_df = set_entity_dtypes(ent_df)

In [None]:
if fs.exists(ent_merged_folder):
    fs.rm(ent_merged_folder, recursive=True)

In [None]:
%%time
ent_df.to_parquet(ent_merged_folder, engine="pyarrow", compression="snappy")

In [None]:
fs.du(ent_merged_folder) / 1e6

In [None]:
ent_df = dd.read_parquet(ent_merged_folder, engine="pyarrow")
ent_df.head()

In [None]:
len(ent_df)

## Entities(5): MeSH

In [None]:
# Select the entity model for this section and derive its three folders.
i = 5
ent_full_folder, ent_inc_folder, ent_merged_folder = (
    template.format(MODEL_NAMES[i])
    for template in (ENT_FULL_FOLDER_T, ENT_INC_FOLDER_T, ENT_MERGED_FOLDER_T)
)

print(
    "Input: {:s}".format(ent_full_folder),
    "Incremental: {:s}".format(ent_inc_folder),
    "Merged: {:s}".format(ent_merged_folder),
    sep="\n",
)

In [None]:
ent_full_df = dd.read_parquet(ent_full_folder, engine="pyarrow")
ent_inc_df = dd.read_parquet(ent_inc_folder, engine="pyarrow")

In [None]:
ent_df = delete_diff_rows(ent_full_df, del_df)
ent_df = add_diff_rows(ent_df, ent_inc_df)
ent_df = set_entity_dtypes(ent_df)

In [None]:
if fs.exists(ent_merged_folder):
    fs.rm(ent_merged_folder, recursive=True)

In [None]:
%%time
ent_df.to_parquet(ent_merged_folder, engine="pyarrow", compression="snappy")

In [None]:
fs.du(ent_merged_folder) / 1e6

In [None]:
ent_df = dd.read_parquet(ent_merged_folder, engine="pyarrow")
ent_df.head()

In [None]:
len(ent_df)

## Entities(6): GO 

In [None]:
# Select the entity model for this section and derive its three folders.
i = 6
ent_full_folder, ent_inc_folder, ent_merged_folder = (
    template.format(MODEL_NAMES[i])
    for template in (ENT_FULL_FOLDER_T, ENT_INC_FOLDER_T, ENT_MERGED_FOLDER_T)
)

print(
    "Input: {:s}".format(ent_full_folder),
    "Incremental: {:s}".format(ent_inc_folder),
    "Merged: {:s}".format(ent_merged_folder),
    sep="\n",
)

In [None]:
ent_full_df = dd.read_parquet(ent_full_folder, engine="pyarrow")
ent_inc_df = dd.read_parquet(ent_inc_folder, engine="pyarrow")

In [None]:
ent_df = delete_diff_rows(ent_full_df, del_df)
ent_df = add_diff_rows(ent_df, ent_inc_df)
ent_df = set_entity_dtypes(ent_df)

In [None]:
if fs.exists(ent_merged_folder):
    fs.rm(ent_merged_folder, recursive=True)

In [None]:
%%time
ent_df.to_parquet(ent_merged_folder, engine="pyarrow", compression="snappy")

In [None]:
fs.du(ent_merged_folder) / 1e6

In [None]:
ent_df = dd.read_parquet(ent_merged_folder, engine="pyarrow")
ent_df.head()

In [None]:
len(ent_df)

## Entities(7): HPO

In [2]:
# Select the entity model for this section and derive its three folders.
# Requires the Setup cell that defines MODEL_NAMES and the ENT_*_FOLDER_T
# templates to have been run first.
i = 7
ent_full_folder, ent_inc_folder, ent_merged_folder = (
    template.format(MODEL_NAMES[i])
    for template in (ENT_FULL_FOLDER_T, ENT_INC_FOLDER_T, ENT_MERGED_FOLDER_T)
)

print(
    "Input: {:s}".format(ent_full_folder),
    "Incremental: {:s}".format(ent_inc_folder),
    "Merged: {:s}".format(ent_merged_folder),
    sep="\n",
)

NameError: name 'ENT_FULL_FOLDER_T' is not defined

In [None]:
ent_full_df = dd.read_parquet(ent_full_folder, engine="pyarrow")
ent_inc_df = dd.read_parquet(ent_inc_folder, engine="pyarrow")

In [None]:
ent_df = delete_diff_rows(ent_full_df, del_df)
ent_df = add_diff_rows(ent_df, ent_inc_df)
ent_df = set_entity_dtypes(ent_df)

In [None]:
if fs.exists(ent_merged_folder):
    fs.rm(ent_merged_folder, recursive=True)

In [None]:
%%time
ent_df.to_parquet(ent_merged_folder, engine="pyarrow", compression="snappy")

In [None]:
fs.du(ent_merged_folder) / 1e6

In [None]:
ent_df = dd.read_parquet(ent_merged_folder, engine="pyarrow")
ent_df.head()

In [None]:
len(ent_df)

## Entities(8): RxNorm

In [None]:
# Select the entity model for this section and derive its three folders.
i = 8
ent_full_folder, ent_inc_folder, ent_merged_folder = (
    template.format(MODEL_NAMES[i])
    for template in (ENT_FULL_FOLDER_T, ENT_INC_FOLDER_T, ENT_MERGED_FOLDER_T)
)

print(
    "Input: {:s}".format(ent_full_folder),
    "Incremental: {:s}".format(ent_inc_folder),
    "Merged: {:s}".format(ent_merged_folder),
    sep="\n",
)

In [None]:
ent_full_df = dd.read_parquet(ent_full_folder, engine="pyarrow")
ent_inc_df = dd.read_parquet(ent_inc_folder, engine="pyarrow")

In [None]:
ent_df = delete_diff_rows(ent_full_df, del_df)
ent_df = add_diff_rows(ent_df, ent_inc_df)
ent_df = set_entity_dtypes(ent_df)

In [None]:
if fs.exists(ent_merged_folder):
    fs.rm(ent_merged_folder, recursive=True)

In [None]:
%%time
ent_df.to_parquet(ent_merged_folder, engine="pyarrow", compression="snappy")

In [None]:
fs.du(ent_merged_folder) / 1e6

In [None]:
ent_df = dd.read_parquet(ent_merged_folder, engine="pyarrow")
ent_df.head()

In [None]:
len(ent_df)

## Clean up

In [None]:
# do this if you're done using the cluster
# cluster.close()