In [None]:
import timeit
from IPython.core.interactiveshell import InteractiveShell
import numpy as np
from multiprocessing import cpu_count
import spacy
import pandas as pd
import glob
import re
from bs4 import BeautifulSoup
import uuid

import matplotlib.pyplot as plt
import seaborn as sns
import seaborn as sns
from matplotlib import rcParams

# Default figure size and seaborn look used by the plots in this notebook
rcParams["figure.figsize"] = 8, 6
sns.set_style("darkgrid")
sns.set_palette("pastel", 12)

# Shared spaCy pipeline object; every later cell parses text through `nlp`
nlp = spacy.load('en_core_web_md')

# Ask IPython to display the value of every top-level expression in a cell,
# not only the last one
InteractiveShell.ast_node_interactivity = "all"

In [None]:
def remove_excess_char(string):
    """Collapse line/tab delimiters and runs of whitespace into single spaces.

    Both the real control characters (newline, carriage return, tab) and
    their literal two-character escaped spellings (a backslash followed by
    ``n``, ``r`` or ``t``) are replaced by a space. Leading and trailing
    whitespace is dropped and every whitespace run becomes one space.

    Args:
        string: the text to normalise.

    Returns:
        The normalised text.
    """
    # real control characters first, then their escaped text variants
    noisy_tokens = ("\n", "\r", "\t", "\\n", "\\r", "\\t")
    cleaned = string
    for token in noisy_tokens:
        cleaned = cleaned.replace(token, " ")
    # split() with no argument also swallows any remaining whitespace runs
    return " ".join(cleaned.split())

def replace_malformed_hex(string):
    """Blank out backslashes and ``x``-prefixed escape remnants.

    Three passes are applied in order, each substituting a single space:

    1. every backslash;
    2. ``x`` followed by two or three decimal digits;
    3. ``xe`` followed by one decimal digit.

    Args:
        string: the text to clean.

    Returns:
        The text with the matched fragments replaced by spaces. Whitespace
        is not collapsed here.
    """
    # NOTE(review): only decimal digits are matched, so fragments with other
    # hex letters (e.g. "xa0") are left in place, and ordinary words that
    # contain "x" + digits are also hit - confirm this is intended.
    without_backslashes = string.replace("\\", " ")
    without_digit_codes = re.sub("x[0-9]{2,3}", " ", without_backslashes)
    return re.sub("xe[0-9]", " ", without_digit_codes)

def multi_process_spacy_docs(texts, nlp, n_process=None, batch_size=256):
    """Parse ``texts`` with ``nlp``, using worker processes only for large inputs.

    Strategy:

    * 100 documents or fewer: call ``nlp`` on each text in-process.
    * Fewer than ``batch_size * n_process`` documents: one ``nlp.pipe`` call
      with the batch size shrunk so the documents spread across all workers.
    * Otherwise: split the documents into near-equal chunks and run one
      ``nlp.pipe`` call per chunk, so only one chunk is in flight at a time.

    Args:
        texts: sized or plain iterable of strings (list, pandas Series, ...).
        nlp: object exposing ``__call__(text)`` and
            ``pipe(texts, batch_size=..., n_process=...)`` (a spaCy pipeline).
        n_process: number of worker processes; falsy means ``cpu_count()``.
        batch_size: per-worker batch size used for the chunked path.

    Returns:
        list of parsed documents, in the same order as ``texts``.

    NOTE(review): this function is defined a second time under the
    "Benchmarking" header; the later definition is the one in effect there.
    """
    if not n_process:
        n_process = cpu_count()

    # Work on a plain list: slicing it keeps the original ``str`` objects,
    # whereas np.array_split would copy a list of strings into a fixed-width
    # unicode array (every document padded to the longest one).
    texts = list(texts)
    num_docs = len(texts)

    # 1.0 ensure that multi-processing isn't used frivolously
    if num_docs <= 100:
        return [nlp(text) for text in texts]

    # 2.0 batch documents, ensure against memory overflows
    iteration_size = batch_size * n_process
    total_iterations = num_docs // iteration_size

    if total_iterations < 1:
        # redefine batch size to ensure best spread across CPU cores; never
        # let it reach 0 when there are more processes than documents
        optimal_batch_size = max(1, num_docs // n_process)
        return list(
            nlp.pipe(texts, batch_size=optimal_batch_size, n_process=n_process)
        )

    # otherwise, iterate through large chunks of documents; chunk sizes match
    # np.array_split: the first `remainder` chunks get one extra document
    base_size, remainder = divmod(num_docs, total_iterations)
    docs = []
    start = 0
    for chunk_index in range(total_iterations):
        stop = start + base_size + (1 if chunk_index < remainder else 0)
        docs.extend(
            nlp.pipe(texts[start:stop], batch_size=batch_size, n_process=n_process)
        )
        start = stop
    return docs
    
def downsample_frame(df, n):
    """Return at most ``n`` rows of ``df``.

    A frame with fewer than ``n`` rows is returned unchanged (the same
    object). Otherwise ``n`` rows are drawn with a fixed seed of 42, so
    repeated calls give the same sample.

    Args:
        df: frame to sample from.
        n: maximum number of rows to keep.

    Returns:
        ``df`` itself, or a sampled frame of exactly ``n`` rows.
    """
    has_fewer_rows_than_requested = df.shape[0] < n
    return df if has_fewer_rows_than_requested else df.sample(n=n, random_state=42)

## Visualisation and Table Plotting

In [None]:
# parse a one-sentence sample; the following cell tabulates its tokens
doc = nlp(
    'otso is a machine learning company that specialises in the analysis '
    'of unstructured text data using state of the art natural language '
    'processing and artificial intelligence technology.'
)

In [None]:
# get token frame: one row per token with its character span and simple flags
pd.DataFrame.from_records(
    [
        {
            'text': token.text,
            'start_char': token.idx,
            'end_char': token.idx + len(token),
            'is_digit': token.is_digit,
            'is_punct': token.is_punct,
        }
        for token in doc
    ]
)

In [None]:
# render dependency parse
from spacy import displacy

# re-bind `doc` to a new sample sentence; later cells read this parse
doc = nlp(
    'otso makes it easy to analyse all of your customer feedback and media '
    'mentions, from any source, at scale.'
)
displacy.render(
    doc,
    style='dep',
    jupyter=True,
    options={'distance': 80, 'collapse_phrases': True, 'bg': '#ffffff'},
)

In [None]:
# get pos frame: part of speech, lemma and containing sentence for each token
pd.DataFrame.from_records(
    [
        {
            'text': token.text,
            'pos': token.pos_,
            'lemma': token.lemma_,
            # keep the sentence as plain text (as fan_ents_single does for the
            # same key) instead of storing a Span object in the frame
            'embedding_sentence': token.sent.text,
        }
        for token in doc
    ]
).head(10)

In [None]:
# materialise the noun chunks of the current parse as a list
[chunk for chunk in doc.noun_chunks]

In [None]:
# get noun chunk frame: chunk text plus the text and lemma of its root token
pd.DataFrame.from_records(
    [
        {
            'noun_chunk': chunk.text,
            'root': chunk.root.text,
            'root_lemma': chunk.root.lemma_,
        }
        for chunk in doc.noun_chunks
    ]
).head(10)

In [None]:
# longer multi-paragraph sample: three passages joined with single spaces
large_doc = " ".join(
    (
        "With a lot of machine learning providers, it can feel like there's not a lot of room for flexibility, or specialisation to suit your needs. We built otso to address many of the shortfalls we saw in existing natural language systems, meaning it is built to work with a range of different use-cases, and can also be tuned and specialised to suit almost any natural language need.",
        "otso makes it easy to analyse all of your customer feedback and media mentions, from any source, at scale. Discover new insights and explore relationships within your world of data, powered by the latest advances in AI.",
        "otso can ingest your data in many different ways. Simply drag and drop your data files, integrate with external data partners, or work with our team to build a custom solution.",
    )
)

In [None]:
# parse the longer sample and highlight its named entities
doc = nlp(large_doc)
# 'distance', 'collapse_phrases' and 'bg' are dependency-visualiser settings
# that the entity visualiser does not use, so they are not passed here
displacy.render(doc, style='ent', jupyter=True)

In [None]:
def fan_ents_single(doc):
    """Fan out (tabulate) all entities within a single parsed document.

    One row is produced per item in ``doc.ents``. ``start`` and ``end`` are
    character offsets relative to the start of the sentence that contains
    the entity, not to the start of the document.

    Args:
        doc: parsed document exposing ``ents``; each entity must provide
            ``sent``, ``text``, ``label_``, ``lemma_``, ``root.pos_``,
            ``start_char`` and ``end_char``.

    Returns:
        pandas DataFrame with columns ``embedding_sentence``, ``entity``,
        ``entity_label``, ``entity_lemma``, ``entity_pos``, ``start``,
        ``end``. A document without entities yields an empty frame that
        still carries these columns.
    """
    columns = [
        "embedding_sentence",
        "entity",
        "entity_label",
        "entity_lemma",
        "entity_pos",
        "start",
        "end",
    ]
    rows = []
    for ent in doc.ents:
        # collect entity annotations
        sentence = ent.sent
        rows.append(
            {
                "embedding_sentence": sentence.text,
                "entity": ent.text,
                "entity_label": ent.label_,
                "entity_lemma": ent.lemma_.lower(),
                "entity_pos": ent.root.pos_,
                "start": ent.start_char - sentence.start_char,
                "end": ent.end_char - sentence.start_char,
            }
        )
    if not rows:
        # keep the schema stable so callers can select/concat columns safely
        return pd.DataFrame(columns=columns)
    return pd.DataFrame.from_records(rows)

In [None]:
# tabulate the entities of the most recently parsed `doc`
fan_ents_single(doc)

In [None]:
# get noun chunk frame for the current `doc`: chunk text, root text, root lemma
pd.DataFrame.from_records(
    [
        {
            'noun_chunk': chunk.text,
            'root': chunk.root.text,
            'root_lemma': chunk.root.lemma_,
        }
        for chunk in doc.noun_chunks
    ]
).head(10)

## Benchmarking

In [None]:
def multi_process_spacy_docs(texts, nlp, n_process=None, batch_size=256):
    """Parse ``texts`` with ``nlp``, using worker processes only for large inputs.

    Strategy:

    * 100 documents or fewer: call ``nlp`` on each text in-process.
    * Fewer than ``batch_size * n_process`` documents: one ``nlp.pipe`` call
      with the batch size shrunk so the documents spread across all workers.
    * Otherwise: split the documents into near-equal chunks and run one
      ``nlp.pipe`` call per chunk, so only one chunk is in flight at a time.

    Args:
        texts: sized or plain iterable of strings (list, pandas Series, ...).
        nlp: object exposing ``__call__(text)`` and
            ``pipe(texts, batch_size=..., n_process=...)`` (a spaCy pipeline).
        n_process: number of worker processes; falsy means ``cpu_count()``.
        batch_size: per-worker batch size used for the chunked path.

    Returns:
        list of parsed documents, in the same order as ``texts``.

    NOTE(review): this re-defines the function of the same name from the
    helper cell near the top of the notebook; being the later definition,
    it is the one the benchmark cells below actually call.
    """
    if not n_process:
        n_process = cpu_count()

    # Work on a plain list: slicing it keeps the original ``str`` objects,
    # whereas np.array_split would copy a list of strings into a fixed-width
    # unicode array (every document padded to the longest one).
    texts = list(texts)
    num_docs = len(texts)

    # 1.0 ensure that multi-processing isn't used frivolously
    if num_docs <= 100:
        return [nlp(text) for text in texts]

    # 2.0 batch documents, ensure against memory overflows
    iteration_size = batch_size * n_process
    total_iterations = num_docs // iteration_size

    if total_iterations < 1:
        # redefine batch size to ensure best spread across CPU cores; never
        # let it reach 0 when there are more processes than documents
        optimal_batch_size = max(1, num_docs // n_process)
        return list(
            nlp.pipe(texts, batch_size=optimal_batch_size, n_process=n_process)
        )

    # otherwise, iterate through large chunks of documents; chunk sizes match
    # np.array_split: the first `remainder` chunks get one extra document
    base_size, remainder = divmod(num_docs, total_iterations)
    docs = []
    start = 0
    for chunk_index in range(total_iterations):
        stop = start + base_size + (1 if chunk_index < remainder else 0)
        docs.extend(
            nlp.pipe(texts[start:stop], batch_size=batch_size, n_process=n_process)
        )
        start = stop
    return docs

In [None]:
# Load every CSV found (recursively) under ../datasets/publications, keep the
# rows whose articleText is a string, and prototype on at most 5000 of them.
df = (
    pd.concat(
        [
            pd.read_csv(path, usecols=['articleText', 'category', 'date'])
            for path in glob.glob('../datasets/publications/**', recursive=True)
            # match the extension only, not '.csv' anywhere in the path
            if path.endswith('.csv')
        ],
        sort=True,
    )
    # drop rows whose article text is missing or otherwise not a string
    .pipe(lambda frame: frame[frame.articleText.apply(lambda text: isinstance(text, str))])
    # use a subset of documents to prototype; downsample_frame uses the same
    # seed (42) but does not raise when fewer than 5000 rows are available
    .pipe(downsample_frame, 5000)
)

In [None]:
# benchmark wall times across different core configs
# Each run parses the first 5000 article texts and records elapsed seconds.
# NOTE(review): with batch_size=512 the helper takes a different path per core
# count (8 cores -> one chunk at batch size 512; 16 and 32 cores -> a single
# pipe call with the batch size shrunk to len/cores), so batch size varies
# together with the core count in this comparison.
# NOTE(review): assumes the host actually has at least 32 cores - confirm.
wall_times = []
for num_cores in [8, 16, 32]:
    before = timeit.default_timer()
    docs = multi_process_spacy_docs(
        df.head(5000).articleText, nlp, n_process=num_cores, batch_size=512)
    after = timeit.default_timer()
    wall_times.append({'cores': num_cores, 'wall_time': after - before})
    print('finished ', num_cores)
    
# plot wall time (seconds) against number of worker processes
rcParams["figure.figsize"] = 8, 8
sns.lineplot(x='cores', y='wall_time', data=pd.DataFrame.from_records(wall_times))
plt.title('Effect of Multi-core Spacy Pipe Operations, 5000 Documents')

In [None]:
# benchmark wall times across different pipeline configs
# Each key names the components left enabled; each value lists the component
# names handed to `disable=` for that run.
pipeline_configs = {'tokenisation_ner_parser_tagger': [],
                    'tokenisation_ner_parser': ['tagger'],
                    'tokenisation_ner': ['tagger', 'parser'],
                    'tokenisation': ['tagger', 'parser', 'ner']}
# re-used name: replaces the per-core timings collected by the previous cell
wall_times = []

for k, v in pipeline_configs.items():
    before = timeit.default_timer()
    # fixed workload for every config: first 2000 texts, 16 processes
    docs = list(nlp.pipe(df.head(2000).articleText,
                    batch_size=32, n_process=16, disable=v))
    after = timeit.default_timer()
    wall_times.append({'pipeline_config': k, 'wall_time': after - before})
    print('finished', k)

In [None]:
# plot wall time for each pipeline configuration timed in the previous cell
rcParams["figure.figsize"] = (8, 8)
sns.lineplot(
    data=pd.DataFrame.from_records(wall_times),
    x='pipeline_config',
    y='wall_time',
)
plt.title('Effect of Simplifying Spacy Pipe Operations, 2000 Documents, 16 Cores')