In [None]:
from IPython.core.interactiveshell import InteractiveShell
import numpy as np
from multiprocessing import cpu_count
import spacy
import pandas as pd
import glob
import re
from bs4 import BeautifulSoup
from pandarallel import pandarallel
import uuid

import matplotlib.pyplot as plt
import seaborn as sns
import seaborn as sns
from matplotlib import rcParams

# Plot defaults for the whole notebook.
rcParams["figure.figsize"] = 8, 6
sns.set_style("darkgrid")
sns.set_palette("pastel", 12)

# `Series.parallel_apply` (used by the preprocessing pipeline) only exists after
# pandarallel has been initialised, so this must run on a fresh kernel.
# Library defaults are used instead of a hard-coded worker count; pass
# `nb_workers=` here to tune it for the machine at hand.
pandarallel.initialize()

# Medium English pipeline; shared by every spaCy call in the notebook.
nlp = spacy.load('en_core_web_md')

# Show the value of every top-level expression in a cell, not only the last one
# (the analysis cells rely on this to display two tables from one cell).
InteractiveShell.ast_node_interactivity = "all"

In [None]:
def remove_excess_char(string):
    """Collapse line/tab delimiters and runs of whitespace into single spaces.

    Both real control characters (newline, carriage return, tab) and their
    escaped two-character spellings (a backslash followed by ``n``, ``r`` or
    ``t``) are turned into spaces. The text is then split on whitespace and
    re-joined, which also strips leading and trailing whitespace.

    Parameters
    ----------
    string : str
        Raw text to normalise.

    Returns
    -------
    str
        The text with every delimiter and whitespace run replaced by one space.
    """
    control_chars = ("\n", "\r", "\t")
    # the same delimiters as they appear when the escape was never decoded
    escaped_literals = ("\\n", "\\r", "\\t")

    cleaned = string
    for token in control_chars + escaped_literals:
        cleaned = cleaned.replace(token, " ")
    return " ".join(cleaned.split())

def replace_malformed_hex(string):
    """Blank out backslashes and left-over hex-escape fragments in ``string``.

    Every backslash becomes a space. Then each ``x`` followed by two or three
    decimal digits, and each ``xe`` followed by one decimal digit, is replaced
    by a single space.

    NOTE(review): the patterns only cover decimal digits (plus the ``xe<digit>``
    special case) and are not anchored to a preceding backslash, so ordinary
    text such as ``x123`` is blanked as well. Confirm this is acceptable for
    the corpus.

    Parameters
    ----------
    string : str
        Text that may contain undecoded escape sequences.

    Returns
    -------
    str
        The text with the matched fragments replaced by spaces.
    """
    without_backslashes = string.replace("\\", " ")
    for pattern in ("x[0-9]{2,3}", "xe[0-9]"):
        without_backslashes = re.sub(pattern, " ", without_backslashes)
    return without_backslashes

def multi_process_spacy_docs(texts, nlp, n_process=None, batch_size=256):
    """Run ``nlp`` over ``texts``, using multi-processing only when worthwhile.

    Parameters
    ----------
    texts : iterable of str
        Documents to process (a list, a pandas Series, ...). It is materialised
        into a list once so that sizing and slicing behave the same for every
        input type.
    nlp : callable
        Pipeline object. It must be callable on a single text and expose
        ``pipe(texts, batch_size=..., n_process=...)``.
    n_process : int, optional
        Number of worker processes. Falsy values fall back to ``cpu_count()``.
    batch_size : int, default 256
        Batch size handed to ``nlp.pipe`` when the corpus is large.

    Returns
    -------
    list
        One processed document per input text, in input order.

    Notes
    -----
    * Up to 100 documents are processed serially in the current process.
    * If the corpus is smaller than ``batch_size * n_process``, the batch size
      is reduced so the work spreads across the workers. It is clamped to 1,
      because more workers than documents would otherwise produce a batch size
      of 0.
    * Larger corpora are fed to ``nlp.pipe`` in balanced chunks of at most
      ``batch_size * n_process`` documents to bound memory use.
    """
    if not n_process:
        n_process = cpu_count()

    texts = list(texts)
    num_docs = len(texts)

    # 1.0 ensure that multi-processing isn't used frivolously
    if num_docs <= 100:
        return [nlp(text) for text in texts]

    # 2.0 batch documents, ensure against memory overflows
    iteration_size = batch_size * n_process

    if num_docs < iteration_size:
        # redefine batch size to ensure best spread across CPU cores
        optimal_batch_size = max(1, num_docs // n_process)
        return list(
            nlp.pipe(texts, batch_size=optimal_batch_size, n_process=n_process)
        )

    # otherwise, iterate through large chunks of documents; ceil-divide twice so
    # the chunks are near-equal in size and never exceed iteration_size
    num_chunks = -(-num_docs // iteration_size)
    chunk_size = -(-num_docs // num_chunks)
    docs = []
    for start in range(0, num_docs, chunk_size):
        chunk = texts[start:start + chunk_size]
        docs.extend(nlp.pipe(chunk, batch_size=batch_size, n_process=n_process))
    return docs
    
def downsample_frame(df, n):
    """Return ``df`` capped at ``n`` rows.

    A frame with fewer than ``n`` rows is returned unchanged (the same object).
    Otherwise ``n`` rows are drawn with ``df.sample`` using a fixed
    ``random_state`` of 42, so repeated calls pick the same rows.
    """
    has_enough_rows = len(df) >= n
    return df.sample(n=n, random_state=42) if has_enough_rows else df

## Preprocess articles

In [None]:
# %%time
# Build the working corpus: read every publication CSV, clean the article text,
# parse it with spaCy and attach per-document statistics.
# NOTE: `parallel_apply` is provided by pandarallel and is only available once
# `pandarallel.initialize()` has been called in this kernel.
df = (pd.concat([(pd.read_csv(e, usecols=['articleText', 'category', 'date'])
                  # cap the contribution of any single file
                  .pipe(lambda x: downsample_frame(x, 100))
                  )
                 # sorted() fixes the concat order so the seeded sample below
                 # picks the same rows regardless of filesystem listing order
                 for e in sorted(glob.glob('../datasets/publications/**', recursive=True))
                 if '.csv' in e], sort=True)
      # keep only rows whose article text is an actual string
      .pipe(lambda x: x[x.articleText.apply(lambda y: isinstance(y, str))])
      # use a subset of documents to prototype; downsample_frame tolerates a
      # corpus smaller than the cap, where a bare .sample(n=...) would raise
      .pipe(lambda x: downsample_frame(x, 50000))
      # remove tags
      .assign(articleText=lambda x: x.articleText.parallel_apply(lambda y: BeautifulSoup(y, "lxml").text))
      # replace malformed hex chars
      .assign(articleText=lambda x: x.articleText.parallel_apply(replace_malformed_hex))
      # remove excessive chars
      .assign(articleText=lambda x: x.articleText.parallel_apply(remove_excess_char))
      # drop duplicates on articleText
      .drop_duplicates(subset=['articleText'])
      # get doc len (space-separated words)
      .assign(doc_len=lambda x: x.articleText.parallel_apply(lambda y: len(y.split(' '))))
      # assign spacy doc
      .assign(spacy_doc=lambda x: multi_process_spacy_docs(x.articleText, nlp, batch_size=128))
      # random identifier used to link the entity / noun-chunk tables back to a document
      .assign(doc_uuid=lambda x: [uuid.uuid4().hex for _ in range(x.shape[0])])
      .assign(date=lambda x: x.date.parallel_apply(pd.to_datetime))
      # pull out some corpus-level stats
      .assign(num_sentences=lambda x: x.spacy_doc.apply(lambda y: len(list(y.sents))))
      .assign(num_tokens=lambda x: x.spacy_doc.apply(len))
      .assign(pub_year=lambda x: x.date.apply(lambda y: y.year))
      )

## Create Entity and Noun Chunk Tables

In [None]:
def fan_ents_all(df, spacy_doc, doc_uuid):
    """Tabulate every entity found in the parsed documents of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per document.
    spacy_doc : str
        Name of the column holding the parsed documents (objects exposing
        ``.ents``).
    doc_uuid : str
        Name of the column holding each document's identifier.

    Returns
    -------
    pandas.DataFrame
        One row per entity, with a fresh ``RangeIndex`` and the columns
        ``embedding_sentence``, ``entity``, ``entity_label``, ``entity_lemma``,
        ``entity_pos``, ``start``, ``end`` and ``doc_uuid``. ``start`` / ``end``
        are character offsets relative to the start of the embedding sentence.
        The frame is empty, but keeps these columns, when ``df`` has no rows or
        no document contains an entity.
    """
    columns = [
        "embedding_sentence",
        "entity",
        "entity_label",
        "entity_lemma",
        "entity_pos",
        "start",
        "end",
        "doc_uuid",
    ]

    def fan_ents_single(doc, uuid_value):
        # fan/tabulate all entities within a single doc
        return [
            {
                "embedding_sentence": e.sent.text,
                "entity": e.text,
                "entity_label": e.label_,
                "entity_lemma": e.lemma_.lower(),
                "entity_pos": e.root.pos_,
                # offsets are made sentence-relative so they index into
                # embedding_sentence directly
                "start": e.start_char - e.sent.start_char,
                "end": e.end_char - e.sent.start_char,
                "doc_uuid": uuid_value,
            }
            for e in doc.ents
        ]

    # collect plain records first and build the frame once, instead of one
    # DataFrame per document followed by a concat
    records = []
    for doc, uuid_value in zip(df[spacy_doc], df[doc_uuid]):
        records.extend(fan_ents_single(doc, uuid_value))
    return pd.DataFrame.from_records(records, columns=columns)

def fan_noun_chunks_all(df, spacy_doc, doc_uuid, min_num_tokens=2):
    """Tabulate the noun chunks found in the parsed documents of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per document.
    spacy_doc : str
        Name of the column holding the parsed documents (objects exposing
        ``.noun_chunks``).
    doc_uuid : str
        Name of the column holding each document's identifier.
    min_num_tokens : int, default 2
        Chunks shorter than this (by ``len(chunk)``) are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per retained chunk, with a fresh ``RangeIndex`` and the columns
        ``embedding_sentence``, ``noun_chunk``, ``root``, ``root_lemma``,
        ``root_pos`` and ``doc_uuid``. The frame is empty, but keeps these
        columns, when nothing qualifies.
    """
    columns = [
        "embedding_sentence",
        "noun_chunk",
        "root",
        "root_lemma",
        "root_pos",
        "doc_uuid",
    ]

    def fan_noun_chunks_single(doc, uuid_value):
        # compile all sufficiently long noun chunks within a single doc
        return [
            {
                # spelled like the entity table's column so both tables share it
                "embedding_sentence": e.sent.text,
                "noun_chunk": e.text,
                # NOTE(review): this stores the root object itself, not its
                # text; confirm that is intended before serialising the frame
                "root": e.root,
                "root_lemma": e.root.lemma_.lower(),
                "root_pos": e.root.pos_,
                "doc_uuid": uuid_value,
            }
            for e in doc.noun_chunks
            if len(e) >= min_num_tokens
        ]

    # collect plain records first and build the frame once, instead of one
    # DataFrame per document followed by a concat
    records = []
    for doc, uuid_value in zip(df[spacy_doc], df[doc_uuid]):
        records.extend(fan_noun_chunks_single(doc, uuid_value))
    return pd.DataFrame.from_records(records, columns=columns)

In [None]:
%%time
# Build the noun-chunk table from the in-memory `df` (timed: it walks every parsed doc).
# NOTE(review): the result is bound to `noun_chunk_frame` (singular), while the
# later cells load and query `noun_chunks_frame` (plural) — confirm which name is intended.
noun_chunk_frame = fan_noun_chunks_all(df, 'spacy_doc','doc_uuid')

In [None]:
%%time
# Build the entity table from the in-memory `df` (timed: it walks every parsed doc).
# NOTE(review): `entity_frame` is reassigned from the CSV cache in the next cell,
# so the analysis further down reads the cached table, not this one.
entity_frame = fan_ents_all(df, 'spacy_doc', 'doc_uuid')

In [None]:
# Load the previously cached tables from disk.
# NOTE(review): nothing visible in this notebook writes these files — presumably
# they were exported from an earlier run; confirm they match the current pipeline.
# Per-document metadata; the analysis below uses its `pub_year` and `doc_uuid` columns.
document_meta_information_frame = pd.read_csv('../datasets/cache/50000_docs_meta_information.csv')
# Replaces the `entity_frame` computed in the cell above with the cached copy.
entity_frame = pd.read_csv('../datasets/cache/50000_docs_entity_frame.csv')
# Cached noun-chunk table; the analysis below groups it by `root_lemma`.
noun_chunks_frame = pd.read_csv('../datasets/cache/50000_noun_chunk_frame.csv')

In [None]:
# Entity labels considered worth exploring. Currently only referenced by the
# optional (disabled) label filter mentioned in the PERSON ranking below.
interesting_entity_types = [
    'PERSON', 'ORG', 'GPE', 'NORP', 'WORK_OF_ART', 'QUANTITY',
    'PRODUCT', 'FAC', 'LOC', 'EVENT', 'LAW',
]

# Identifiers of the documents published in or before 1900.
pre_fed_doc_uuids = document_meta_information_frame.loc[
    lambda meta: meta.pub_year <= 1900, 'doc_uuid'
]

# Top 20 most mentioned PERSON entities (with some error..).
# To widen the ranking, swap the label test for
# `ents.entity_label.isin(interesting_entity_types)`.
(entity_frame
 .loc[lambda ents: ents.doc_uuid.isin(pre_fed_doc_uuids) & ents.entity_label.eq("PERSON")]
 .groupby(by=['entity_label', 'entity_lemma'])
 .size()
 .sort_values(ascending=False)
 .reset_index(name='frequency')
 .head(20)
 )

# Compare/contrast with the 20 most frequent noun-chunk root lemmas.
(noun_chunks_frame
 .loc[lambda chunks: chunks.doc_uuid.isin(pre_fed_doc_uuids)]
 .groupby(by=['root_lemma'])
 .size()
 .sort_values(ascending=False)
 .reset_index(name='frequency')
 .head(20)
 )

In [None]:
# Spot-check: list every entity row whose lemma is "stabling".
# NOTE(review): presumably probing a questionable hit from the ranking above — confirm.
entity_frame.query('entity_lemma == "stabling"')

In [None]:
# Spot-check: list every entity row whose lemma is "sheep".
# NOTE(review): presumably probing a questionable hit from the ranking above — confirm.
entity_frame.query('entity_lemma == "sheep"')