# Split Paragraphs to Sentences

The input to this notebook is the output of the `01-load-and-parse.ipynb` notebook. We use the sentence splitter in the SciSpacy English Medium language model to split the paragraphs retrieved from the JSON files into sentences. At the end of this exercise, we should have a dataframe that contains the columns (`cord_uid`, `pid`, `sid`, `stext`).

## Initialize Dask Cluster

In [1]:
from dask.distributed import Client, wait
from dask_saturn import SaturnCluster
import time

# Number of Dask workers requested; reused later when waiting for them to join.
n_workers = 10

# Start (or attach to) the Saturn-managed Dask cluster and connect a client to it.
cluster = SaturnCluster(
    n_workers=n_workers,
    scheduler_size='2xlarge',
    worker_size='4xlarge',
    nthreads=16,
)
client = Client(cluster)
cluster

[2020-09-17 15:23:12] INFO - dask-saturn | Starting cluster. Status: pending
[2020-09-17 15:23:17] INFO - dask-saturn | Starting cluster. Status: pending
[2020-09-17 15:23:35] INFO - dask-saturn | Starting cluster. Status: pending
[2020-09-17 15:24:01] INFO - dask-saturn | Starting cluster. Status: pending
[2020-09-17 15:25:01] INFO - dask-saturn | Starting cluster. Status: pending
[2020-09-17 15:25:55] INFO - dask-saturn | Starting cluster. Status: pending
[2020-09-17 15:26:30] INFO - dask-saturn | Starting cluster. Status: pending
[2020-09-17 15:27:13] INFO - dask-saturn | Starting cluster. Status: pending
[2020-09-17 15:27:51] INFO - dask-saturn | Cluster is ready


VBox(children=(HTML(value='<h2>SaturnCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n   …

In [2]:
# Poll the scheduler every 30 seconds until all requested workers have joined.
while True:
    if len(client.scheduler_info()['workers']) >= n_workers:
        break
    print('Waiting for workers, got', len(client.scheduler_info()['workers']))
    time.sleep(30)
print('Done!')

Waiting for workers, got 0
Waiting for workers, got 0
Waiting for workers, got 0
Waiting for workers, got 0
Waiting for workers, got 0
Waiting for workers, got 0
Waiting for workers, got 0
Waiting for workers, got 1
Done!


## Processing

In [3]:
import dask.dataframe as dd
import json
import numpy as np
import pandas as pd
import spacy
import s3fs
import scispacy

from dask.distributed import Client, progress, get_worker

In [4]:
# S3 bucket holding both the input paragraphs and the output sentences.
BUCKET_NAME = "saturn-elsevierinc"

# Input (paragraph Parquet files) and output (sentence Parquet files) locations.
PARAGRAPH_FOLDER = f"s3://{BUCKET_NAME}/cord19-paras-pq"
SENTENCE_FOLDER = f"s3://{BUCKET_NAME}/cord19-sents-pqr"

In [5]:
# Load the paragraph table from S3 and discard rows that contain missing values.
paragraph_df = dd.read_parquet(PARAGRAPH_FOLDER, engine="pyarrow").dropna()
paragraph_df.head(npartitions=10)

Unnamed: 0,cord_uid,pid,ptext
0,ug7v899j,T,Clinical features of culture-proven Mycoplasma...
0,ug7v899j,A0,Objective: This retrospective chart review des...
0,ug7v899j,A1,Patients with positive M. pneumoniae cultures ...
0,ug7v899j,A2,"Results: 40 patients were identified, 33 (82.5..."
0,ug7v899j,A3,Conclusion: our results were similar to publis...


In [6]:
# Number of paragraph rows remaining after dropna().
len(paragraph_df)

3389009

In [7]:
def chunk_large_text(large_text, chunk_size, max_batch_size):
    """Split a long text into consecutive chunks that end on a period.

    Each chunk is at most ``chunk_size`` characters long and, whenever the
    current window contains a ``'.'``, ends right after the last one so that
    sentences are not cut in half. A window without any period is emitted
    whole (hard split, or the trailing fragment at the end of the text), which
    guarantees forward progress and keeps every chunk within ``chunk_size``.

    Parameters
    ----------
    large_text : str
        Text to split.
    chunk_size : int or float
        Maximum chunk length in characters. Floats are truncated to int
        because callers compute it as a fraction of a length limit.
    max_batch_size : int
        Maximum number of chunks to return; any text beyond that many chunks
        is dropped.

    Returns
    -------
    list of str
        The chunks, in order. Empty for an empty text or a non-positive
        ``max_batch_size``.

    Raises
    ------
    ValueError
        If ``chunk_size`` truncates to less than 1.
    """
    # Slice bounds must be integers; a float here would raise TypeError.
    chunk_size = int(chunk_size)
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    text_len = len(large_text)
    start, texts = 0, []
    while start < text_len and len(texts) < max_batch_size:
        window = large_text[start : start + chunk_size]
        last_period = window.rfind('.')
        if last_period < 0:
            # No sentence boundary in this window: take the whole window.
            last_period = len(window) - 1
        texts.append(window[: last_period + 1])
        # Always advances by at least one character, so the loop terminates.
        start += last_period + 1
    return texts
            

def handle_row(row, nlp, max_text_length=1000000, max_batch_size=10):
    """Split one paragraph row into numbered sentences.

    Parameters
    ----------
    row : object
        Row exposing the paragraph text as ``row.ptext``.
    nlp : object
        Loaded spaCy pipeline; only ``nlp.pipe`` and ``doc.sents`` are used.
    max_text_length : int
        Longest text handed to the pipeline in one piece (a SpaCy limit).
        Longer paragraphs are pre-split with ``chunk_large_text``.
    max_batch_size : int
        Maximum number of chunks produced for an over-long paragraph.

    Returns
    -------
    list of (int, str)
        ``(sid, sentence_text)`` tuples. ``sid`` counts sentences across the
        whole paragraph, so it stays unique even when the paragraph had to be
        processed as several chunks.
    """
    ptext = row.ptext
    if len(ptext) > max_text_length:
        # Leave 10% headroom below the limit; the chunk size must be an int
        # because it is used as a slice bound.
        texts = chunk_large_text(
            ptext, int(max_text_length * 0.9), max_batch_size)
    else:
        texts = [ptext]
    sents = []
    # n_threads is intentionally not passed: it is deprecated in spaCy 2.1+
    # and not accepted by spaCy 3.x.
    for doc in nlp.pipe(texts, batch_size=len(texts)):
        for sent in doc.sents:
            # len(sents) is the running sentence index within the paragraph.
            sents.append((len(sents), sent.text))
    return sents


def handle_partition(part):
    """Sentence-split every paragraph row of one dataframe partition.

    The spaCy model is loaded once per Dask worker and cached on the worker
    object as ``worker.nlp`` so later partitions on the same worker reuse it.

    Parameters
    ----------
    part : pandas.DataFrame
        Partition whose rows expose the paragraph text as ``ptext``.

    Returns
    -------
    pandas.Series
        One list of ``(sid, sentence_text)`` tuples per input row, aligned
        with ``part.index``.
    """
    if len(part) == 0:
        # DataFrame.apply(axis=1) on an empty frame can hand back a DataFrame
        # instead of a Series; return a well-typed empty Series instead.
        return pd.Series([], index=part.index, dtype=object)

    worker = get_worker()
    # Only a missing attribute means "not loaded yet"; any other error
    # (e.g. the model failing to load) should surface rather than be swallowed.
    nlp = getattr(worker, "nlp", None)
    if nlp is None:
        nlp = spacy.load("en_core_sci_md", disable=["tagger", "ner"])
        worker.nlp = nlp
    return part.apply(lambda row: handle_row(row, nlp), axis=1)

In [8]:
# TODO: revisit whether repartitioning up front is worthwhile.
# sentence_df = paragraph_df.repartition(npartitions=20)
# NOTE(review): presumably copied so the column assignments that follow do
# not touch paragraph_df -- confirm.
sentence_df = paragraph_df.copy()

In [9]:
# sentence_df.head()

In [10]:
# Attach, per paragraph row, the list of (sid, stext) tuples produced by
# handle_partition. meta is given as a (name, dtype) tuple; note that
# ("object") without a comma is just the string "object", not a tuple.
sentence_df["sentences"] = sentence_df.map_partitions(
    handle_partition, meta=("sentences", "object"))
sentence_df = sentence_df.drop(columns=["ptext"])
# One row per sentence; rows left without a sentence tuple are dropped.
sentence_df = sentence_df.explode("sentences")
sentence_df = sentence_df.dropna()
# Unpack each (sid, stext) tuple into its own column.
sentence_df["sid"] = sentence_df.apply(
    lambda row: row.sentences[0], meta=("sid", "int64"), axis=1)
sentence_df["stext"] = sentence_df.apply(
    lambda row: row.sentences[1], meta=("stext", "object"), axis=1)
sentence_df = sentence_df.drop(columns=["sentences"])

In [11]:
# Cast every output column to an explicit dtype in a single pass.
sentence_df = sentence_df.astype({
    "cord_uid": str,
    "pid": str,
    "sid": np.int32,
    "stext": str,
})

In [12]:
# Delete any existing output under SENTENCE_FOLDER (recursively) before the
# sentences are written there again.
fs = s3fs.S3FileSystem()
if fs.exists(SENTENCE_FOLDER):
    fs.rm(SENTENCE_FOLDER, recursive=True)

In [13]:
# Sentence partitions are not evenly sized, which can cause problems
# with downstream tasks, so repartition for performance.
# Check the partition sizes with:
# part_len = sentence_df.map_partitions(len).compute()
# part_len.describe()
sentence_df = sentence_df.repartition(partition_size="20MB")

In [14]:
%%time
# Write the sentence table to SENTENCE_FOLDER as snappy-compressed Parquet.
sentence_df.to_parquet(SENTENCE_FOLDER, engine="pyarrow", compression="snappy")

CPU times: user 3.12 s, sys: 303 ms, total: 3.42 s
Wall time: 1h 2min 23s


## Verify Result

In [15]:
# Show the output location that is verified below.
SENTENCE_FOLDER

's3://saturn-elsevierinc/cord19-sents-pqr'

In [16]:
# Disk usage of the output folder divided by 1e6
# (i.e. megabytes, assuming fs.du reports bytes -- TODO confirm).
fs.du(SENTENCE_FOLDER) / 1e6

1409.544747

In [17]:
# Re-read the sentences from S3 (rebinding sentence_df) and preview the
# first rows to check what was written.
sentence_df = dd.read_parquet(SENTENCE_FOLDER, engine="pyarrow")
sentence_df.head()

Unnamed: 0,cord_uid,pid,sid,stext
0,ug7v899j,T,0,Clinical features of culture-proven Mycoplasma...
0,ug7v899j,A0,0,Objective: This retrospective chart review des...
0,ug7v899j,A1,0,Patients with positive M. pneumoniae cultures ...
0,ug7v899j,A1,1,Charts of patients were reviewed.
0,ug7v899j,A2,0,"Results: 40 patients were identified, 33 (82.5..."


In [18]:
# Number of sentence rows in the re-read output.
len(sentence_df)

16952279

In [19]:
# Do this if you're done using the cluster.
cluster.close()