In [None]:
%%capture
# Install the notebook's dependencies with the interpreter that runs this
# kernel (`sys.executable`), so the packages land in the kernel's environment.
import sys
!{sys.executable} -m pip install -r requirements.txt
# Download the small English spaCy model (loaded below with `spacy.load`).
!{sys.executable} -m spacy download en_core_web_sm
# Set TOKENIZERS_PARALLELISM for the kernel process
# (presumably read by the Hugging Face tokenizers library -- TODO confirm).
%env TOKENIZERS_PARALLELISM=true

In [None]:
%%capture
# import libraries that we'll need
import os, re
import spacy
from spacy import displacy
from spacy.matcher import Matcher
import textacy
from textacy import preprocessing
from os.path import isfile, join
from functools import partial

import difflib

import contextualSpellCheck

# Textract can extract text from a variety of formats, including images
# (although OCR results might be a little dodgy)
import textract

# Load the small English spaCy pipeline and attach the contextual spell
# checker to it as an additional pipeline component.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)
contextualSpellCheck.add_to_pipe(nlp)

# Token-pattern matcher that shares the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [None]:
# Constants
# Folder holding the corpus files; a relative path, so it is resolved against
# the notebook's current working directory.
data_directory = "data"

# Preprocessing and cleaning

Define a preprocessing pipeline

In [None]:
# Text-cleaning pipeline applied to each raw document before it is handed to
# spaCy. The steps run in the order they are listed.
preproc = preprocessing.make_pipeline(
    # Normalize words in text that have been split across lines by a hyphen for visual consistency
    # (aka hyphenated) by joining the pieces back together, sans hyphen and whitespace
    preprocessing.normalize.hyphenated_words,
    preprocessing.normalize.whitespace,
    # Replace URLs with a fixed placeholder token
    partial(preprocessing.replace.urls, repl="OOV_URL"),
    partial(preprocessing.normalize.repeating_chars, chars=" "),
    # Collapse every remaining run of whitespace (line breaks included) into a
    # single space. The pattern is a raw string because "\s" inside a normal
    # string literal is an invalid escape sequence.
    lambda text: re.sub(r"\s+", " ", text),
)

# Importing and converting corpus files

Reads every file in the `data` folder (for example `.pdf` and `.docx` files) and extracts its text.

In [None]:
# Collect every regular file in the data directory. `os.listdir` returns
# entries in arbitrary order, so sort the paths to keep the document indices
# used further down stable from run to run.
to_load = sorted(
    join(data_directory, f)
    for f in os.listdir(data_directory)
    if isfile(join(data_directory, f))
)
unprocessed_docs = []

# Extract the text of each file and run it through the cleaning pipeline
for file in to_load:
    raw_text = textract.process(file).decode("utf-8")
    unprocessed_docs.append(preproc(raw_text))

# Run the spaCy pipeline over all cleaned texts and materialise the results
processed_docs = list(nlp.pipe(unprocessed_docs))

# Show a small preview for each document in the corpus
[d._.preview for d in processed_docs]

Show all `contextualSpellCheck` suggestions. They're made by an ML model, and are sometimes very strange. It tries to correct all words that the spaCy model hasn't seen before, based on their surrounding context. None of the corrections have been applied to the text in this notebook, but it is not hard to do.

In [None]:
# Spell-check suggestions for every document in the corpus. This iterates over
# `processed_docs` directly so the cell does not depend on a `doc` variable
# that is only assigned in a later cell.
[d._.suggestions_spellCheck for d in processed_docs]

# Visualisations

In [None]:
# Grab the first document (index 0); the visualisations and extraction cells
# below all operate on this one document.
doc = processed_docs[0]
# Materialise the sentence spans so they can be indexed
sentence_spans = list(doc.sents)

## Named entities

In [None]:
# Render named entities for the first 200 tokens only, for convenience
preview_span = doc[:200]
displacy.render(preview_span, style="ent")

## Dependency parse tree

In [None]:
# Render the dependency parse of the first sentence only, in compact mode
first_sentence = sentence_spans[0]
render_options = {"compact": True}
displacy.render(first_sentence, style="dep", options=render_options)

# Ngrams

List all bi- and trigrams that occur at least twice

In [None]:
# Bi- and trigrams (n = 2 and n = 3) that occur at least twice in the document
ngrams = list(textacy.extract.basics.ngrams(doc, (2, 3), min_freq=2))

# Display the text of every n-gram found (an empty list if there are none)
[ngram.text for ngram in ngrams]

# Pattern matching

Find all instances of adjective + noun and adjective + proper noun

In [None]:
# Token patterns to search for, keyed by the name reported for each match
patterns = {
    "adj_noun": [{"POS": "ADJ"}, {"POS": "NOUN"}],
    "adj_propn": [{"POS": "ADJ"}, {"POS": "PROPN"}],
}

# Add the patterns to the matcher. Adding to an existing key appends to its
# patterns, so remove any rule left over from a previous run of this cell
# first; that keeps the cell idempotent.
for pattern_name, pattern in patterns.items():
    if pattern_name in matcher:
        matcher.remove(pattern_name)
    matcher.add(pattern_name, [pattern])

matches = matcher(doc)

for match_id, start, end in matches:
    # Get string representation
    pattern_name = nlp.vocab.strings[match_id]
    # Get text of match
    span = doc[start:end]
    print(f"{span.text} -- {pattern_name}\n")
