# Split Paragraphs to Sentences

The input to this notebook is the output of the `01-load-and-parse.ipynb` notebook. We use the sentence splitter in the SciSpacy English Medium language model to split the paragraphs retrieved from the JSON files into sentences. At the end of this exercise, we should have a dataframe that contains the columns `cord_uid`, `pid`, `sid`, and `stext`.

In [1]:
import dask.dataframe as dd
import json
import numpy as np
import pandas as pd
import spacy
import s3fs
import scispacy

from dask.distributed import Client, progress, get_worker

In [2]:
# S3 bucket used for both the paragraph input and the sentence output below.
BUCKET_NAME = "saturn-elsevierinc"

# Joining "s3:/" with "/" produces URLs of the form "s3://<bucket>/<folder>".
PARAGRAPH_FOLDER = "/".join(["s3:/", BUCKET_NAME, "cord19-paras-pq-sm"])
SENTENCE_FOLDER = "/".join(["s3:/", BUCKET_NAME, "cord19-sents-pq-sm"])

In [3]:
# Load the paragraph Parquet dataset and discard rows containing nulls in a
# single expression, then preview rows drawn from the first 10 partitions.
paragraph_df = (
    dd.read_parquet(PARAGRAPH_FOLDER, engine="pyarrow")
    .dropna()
)
paragraph_df.head(npartitions=10)

Unnamed: 0,cord_uid,pid,ptext
6498,sz7qmi8q,T,
6498,sz7qmi8q,A0,Schwer punkt: Lun gen-und Pleura pa tho lo gie...
6498,sz7qmi8q,A1,In traal veolä re Ak ku mu la ti on von SP-A I...
6498,sz7qmi8q,A2,In traal veolä re Ak ku mu la ti on von pro SP...
6498,sz7qmi8q,B0,Die hi sto pa tho lo gi sche Un ter su chung v...


In [4]:
# Number of paragraph rows left after the null rows were dropped.
len(paragraph_df)

1215

## Processing

In [5]:
# Dask client created with processes=False and two workers of one thread each.
client = Client(processes=False, n_workers=2, threads_per_worker=1)
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 43839 instead
  http_address["port"], self.http_server.port


0,1
Client  Scheduler: inproc://10.0.0.177/586/1  Dashboard: http://10.0.0.177:43839/status,Cluster  Workers: 2  Cores: 2  Memory: 16.25 GB


In [6]:
def chunk_large_text(large_text, chunk_size, max_batch_size):
    """Split an oversized text into chunks of at most ``chunk_size`` characters.

    Each chunk ends just after the last period found inside the current
    ``chunk_size`` window, so chunks break on a likely sentence boundary.
    If the window contains no period at all, the text is cut hard at
    ``chunk_size`` so that no chunk ever exceeds the requested size.
    Concatenating the returned chunks reproduces the consumed prefix of
    ``large_text`` exactly (nothing is duplicated or dropped in between).

    Args:
        large_text: The text to split.
        chunk_size: Maximum chunk length in characters. A float is accepted
            and truncated to an integer.
        max_batch_size: Maximum number of chunks to return; any text beyond
            that many chunks is discarded.

    Returns:
        A list of at most ``max_batch_size`` strings, each no longer than
        ``chunk_size`` characters. An empty input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # Slice bounds must be integers, but callers may compute the size as a
    # float (e.g. ``max_text_length * 0.9``).
    chunk_size = int(chunk_size)
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    text_len = len(large_text)
    start, texts = 0, []
    while start < text_len and len(texts) < max_batch_size:
        # The remainder already fits into one chunk: take it whole.
        if text_len - start <= chunk_size:
            texts.append(large_text[start:])
            break
        last_period = large_text.rfind('.', start, start + chunk_size)
        if last_period == -1:
            # No sentence boundary in this window; fall back to a hard cut
            # so the size limit is still respected.
            end = start + chunk_size
        else:
            # Keep the period with the chunk it terminates.
            end = last_period + 1
        texts.append(large_text[start:end])
        start = end
    return texts
            

def handle_row(row, nlp, max_text_length=1000000, max_batch_size=10):
    """Split one paragraph row into numbered sentences.

    Args:
        row: Object exposing the paragraph text as ``row.ptext``.
        nlp: Pipeline object whose ``pipe(texts)`` yields documents exposing
            a ``sents`` iterable of spans with a ``text`` attribute.
        max_text_length: Longest text handed to ``nlp`` in one piece
            (a SpaCy limit); longer paragraphs are chunked first.
        max_batch_size: Maximum number of chunks processed for one
            oversized paragraph.

    Returns:
        A list of ``(sid, sentence_text)`` tuples, where ``sid`` counts up
        from 0 across the whole paragraph.
    """
    # max_text_length is a SpaCy limit
    ptext = row.ptext
    if len(ptext) > max_text_length:
        # Leave 10% headroom below the limit; the chunk size is used as a
        # slice bound, so it has to be an integer.
        texts = chunk_large_text(
            ptext, int(max_text_length * 0.9), max_batch_size)
    else:
        texts = [ptext]
    sents = []
    for doc in nlp.pipe(texts):
        for sent in doc.sents:
            # Number sentences with a running index over all chunks so ids
            # stay unique within the paragraph even when it was split.
            sents.append((len(sents), sent.text))
    return sents


def handle_partition(part):
    """Sentence-split every row of one dataframe partition.

    The language model is loaded once per worker and cached on the worker
    object as ``worker.nlp`` so later partitions handled by the same worker
    reuse it.

    Args:
        part: Partition dataframe whose rows expose a ``ptext`` field.

    Returns:
        The result of applying ``handle_row`` row-wise: one list of
        ``(sid, sentence_text)`` tuples per input row.
    """
    worker = get_worker()
    # Look up the cached model explicitly rather than catching every
    # exception, so unrelated errors are not silently swallowed.
    nlp = getattr(worker, "nlp", None)
    if nlp is None:
        nlp = spacy.load("en_core_sci_md", disable=["tagger", "ner"])
        worker.nlp = nlp
    sentences = part.apply(lambda row: handle_row(row, nlp),
                           axis=1)
    return sentences

In [7]:
# TODO: revisit whether repartitioning (e.g. into 20 partitions) helps here.
# sentence_df = paragraph_df.repartition(npartitions=20)
# For now, work on a copy of the paragraph frame without repartitioning.
sentence_df = paragraph_df.copy()

In [8]:
# sentence_df.head()

In [9]:
# Run the sentence splitter over every partition; each row receives a list of
# (sid, sentence_text) tuples.
# NOTE(review): ("object"), ("int") and ("str") below are plain strings, not
# (name, dtype) tuples — confirm this is the intended form of `meta`.
sentence_df["sentences"] = sentence_df.map_partitions(
    handle_partition, meta=("object"))

# Drop the paragraph text, fan out to one row per sentence tuple, and discard
# rows left with nulls.
sentence_df = (
    sentence_df
    .drop(columns=["ptext"])
    .explode("sentences")
    .dropna()
)

# Unpack each (sid, sentence_text) tuple into its own column, then remove the
# intermediate tuple column.
sentence_df["sid"] = sentence_df.apply(
    lambda sent_row: sent_row.sentences[0], axis=1, meta=("int"))
sentence_df["stext"] = sentence_df.apply(
    lambda sent_row: sent_row.sentences[1], axis=1, meta=("str"))
sentence_df = sentence_df.drop(columns=["sentences"])

In [10]:
# Cast every output column to its target dtype, in the same order as before.
output_dtypes = {
    "cord_uid": str,
    "pid": str,
    "sid": np.int32,
    "stext": str,
}
for column_name, column_dtype in output_dtypes.items():
    sentence_df[column_name] = sentence_df[column_name].astype(column_dtype)

In [11]:
# Delete any existing sentence output folder (recursively) before writing.
fs = s3fs.S3FileSystem()
if fs.exists(SENTENCE_FOLDER):
    fs.rm(SENTENCE_FOLDER, recursive=True)

In [12]:
# Write the sentence frame to SENTENCE_FOLDER as snappy-compressed Parquet.
sentence_df.to_parquet(SENTENCE_FOLDER, engine="pyarrow", compression="snappy")

## Verify Result

In [13]:
# Read the sentence Parquet output back from S3 and preview it as a check.
sentence_df = dd.read_parquet(SENTENCE_FOLDER, engine="pyarrow")

sentence_df.head(npartitions=10)

Unnamed: 0,cord_uid,pid,sid,stext
6498,sz7qmi8q,A0,0,Schwer punkt: Lun gen-und Pleura pa tho lo gie...
6498,sz7qmi8q,A0,1,für Pa tho lo gie der Ruhr-Uni ver si tät Bo c...
6498,sz7qmi8q,A0,2,fi ka ti on der Er kran kun gen des pul mo
6498,sz7qmi8q,A0,3,na len Sur fac tant-Sys tems TYP I TYP II TYP ...
6498,sz7qmi8q,A0,4,re nenund Säug lings al ter Neu ge bo


In [14]:
# Total number of sentence rows in the re-read output.
len(sentence_df)

7313