In [3]:
import spacy
from spacy import displacy
# Load the "en_core_web_lg" spaCy pipeline once; every later cell reuses `nlp`.
nlp = spacy.load("en_core_web_lg")

def load_doc(path, encoding="utf-8"):
    """Read a text file and return its entire contents as a single string.

    Args:
        path: Location of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding;
            pass another codec name for files stored differently.

    Returns:
        The decoded contents of the file as a ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file's bytes are not valid in ``encoding``.
    """
    with open(path, 'r', encoding=encoding) as handle:
        return handle.read()

# Read the source text from disk and run the loaded pipeline over all of it.
# `doc` is the parsed result that the sentence generator below iterates over.
text = load_doc('data/zoom_424b4.htm.txt')
doc = nlp(text)

In [None]:
import re

def filter_spans(spans):
    """Filter a sequence of spans so that the returned spans do not overlap.

    Longer spans take priority over shorter ones; among spans of equal length
    the one that starts earlier takes priority. A span is dropped only when it
    collides with a span that has already been kept.

    For spaCy 2.1.4+: this function is available as spacy.util.filter_spans().

    Args:
        spans: Iterable of objects exposing integer ``start`` and ``end``
            attributes, with ``end`` exclusive.

    Returns:
        A new list containing the kept spans, ordered by ``start``.
    """
    # Longest first; for equal lengths the negated start puts earlier spans first.
    by_priority = sorted(
        spans,
        key=lambda span: (span.end - span.start, -span.start),
        reverse=True,
    )
    result = []
    seen_tokens = set()
    for span in by_priority:
        # Check for end - 1 here because boundaries are inclusive
        if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
            result.append(span)
            # Claim tokens only for spans that were kept. Claiming them for
            # rejected spans too would let a discarded span block a later span
            # that overlaps nothing in the result.
            seen_tokens.update(range(span.start, span.end))
    return sorted(result, key=lambda span: span.start)

def extract_currency_relations(doc):
    """Pair each MONEY token in ``doc`` with the token it appears to relate to.

    The entities and noun chunks of ``doc`` are first de-overlapped with
    ``filter_spans`` and merged into single tokens, so ``doc`` is retokenized
    in place as a side effect.

    Two dependency patterns are recognised for a MONEY token:
      * it is an ``attr`` or ``dobj``: it is paired with the first ``nsubj``
        found to the left of its head (skipped when there is none);
      * it is a ``pobj`` whose head is a ``prep``: it is paired with the head
        of that preposition.

    Args:
        doc: A parsed spaCy document.

    Returns:
        A list of ``(related_token, money_token)`` tuples in document order.
    """
    # Collapse entities and noun chunks so each one is a single token.
    mergeable = filter_spans([*doc.ents, *doc.noun_chunks])
    with doc.retokenize() as retokenizer:
        for piece in mergeable:
            retokenizer.merge(piece)

    pairs = []
    for token in doc:
        if token.ent_type_ != "MONEY":
            continue
        if token.dep_ in ("attr", "dobj"):
            subjects = [left for left in token.head.lefts if left.dep_ == "nsubj"]
            if subjects:
                pairs.append((subjects[0], token))
        elif token.dep_ == "pobj" and token.head.dep_ == "prep":
            pairs.append((token.head.head, token))
    return pairs

def contains_info(sent):
    """Tell whether a sentence mentions revenue together with a quantity-like entity.

    Args:
        sent: Object exposing ``text`` (a string) and ``ents`` (an iterable of
            entities that each carry a ``label_``).

    Returns:
        True when the lower-cased text contains 'revenue' and at least one
        entity is labelled DATE, TIME, PERCENT or MONEY; False otherwise.
    """
    # Entities are only inspected once the keyword is known to be present.
    if 'revenue' not in sent.text.lower():
        return False
    wanted_labels = ('DATE', 'TIME', 'PERCENT', 'MONEY')
    return any(entity.label_ in wanted_labels for entity in sent.ents)

def clean_text(text):
    """Trim ``text`` and squeeze every run of whitespace into one space.

    Args:
        text: The string to normalise.

    Returns:
        The string without leading/trailing whitespace and with each internal
        whitespace run (spaces, tabs, newlines, ...) replaced by a single space.
    """
    trimmed = text.strip()
    collapsed = re.sub(r"(\s+)", r" ", trimmed)
    return collapsed

# Lazy generator of whitespace-normalised sentence texts, limited to the
# sentences of `doc` that pass contains_info. It is single-use: items consumed
# by one cell are gone for later cells until this line is run again.
g = (clean_text(s.text) for s in doc.sents if contains_info(s))

In [None]:
# Re-parse up to 10 of the matching sentences and print the currency relations
# found in each. zip() ends the loop quietly when `g` holds fewer than 10 items
# (or was partly consumed by an earlier run) instead of letting next() raise
# StopIteration. range(10) comes first so that zip stops on the range without
# pulling an extra, discarded sentence out of `g`.
for _, sentence_text in zip(range(10), g):
    # `sent` stays bound after the loop; the cells below inspect the last one.
    sent = nlp(sentence_text)
    relations = extract_currency_relations(sent)
    for r1, r2 in relations:
        print(sent)
        print("{:<10}\t{}\t{}".format(r1.text, r2.ent_type_, r2.text))
        print('\n')

In [None]:
# `sent` is the document left behind by the last nlp(...) call in the loop cell.
print("Sentence:", sent)

In [None]:
# Print the text of each noun chunk in the current sentence, one per line.
for chunk in sent.noun_chunks:
    print(chunk.text)

In [None]:
# Print each named entity of the current sentence followed by its label.
print("Entities in the sentence:")
for ent in sent.ents:
    print(ent.text, ent.label_)

In [None]:
# Render the current sentence with displaCy's entity style, inline in the notebook.
displacy.render(sent, style="ent", jupyter=True)

In [None]:
# Per-token dump of the current sentence: text, lemma_, pos_, tag_, dep_,
# shape_, is_alpha and is_stop, in that order.
# Note: extract_currency_relations already retokenized `sent`, so each entity
# or noun chunk it merged shows up here as a single token.
for token in sent:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)