In [1]:
# `display` and `HTML` belong to the public `IPython.display` API; importing
# them from `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import display, HTML

# Inject CSS so elements of class `container` use 95% of the width with a
# 2.5% left margin.
display(HTML("<style>.container { margin-left: 2.5%; width:95% !important;}</style>"))

A simple example of extracting relations between phrases and entities using
spaCy's named entity recognizer and the dependency parse. Here, we extract
money and currency values (entities labelled as MONEY) and then check the
dependency tree to find the noun phrase they are referring to – for example:
$9.4 million --> Net income.

Compatible with: spaCy v2.0.0+

In [2]:
import plac
import spacy

In [5]:
# Load the spaCy pipeline package named 'en_core_web_lg'; `nlp` is reused by
# every parsing cell below.
nlp = spacy.load('en_core_web_lg')

In [3]:
# Example sentences containing money amounts; each one is parsed and inspected
# separately below (TEXTS[0] first, then TEXTS[1]).
TEXTS = [
    'Net income was $9.4 million compared to the prior year of $2.7 million.',
    'Revenue exceeded twelve billion dollars, with a loss of $1b.',
]

In [11]:
# Run the pipeline on the first example sentence.
doc = nlp(TEXTS[0])

In [12]:
# Visualise `doc` inline in the notebook, before any spans are merged.
spacy.displacy.render(doc, jupyter=True)

In [13]:
# Candidate spans to merge: named entities first, then noun chunks.
# The same phrase can appear twice (once as an entity, once as a noun chunk).
spans = list(doc.ents) + list(doc.noun_chunks)
spans

[$9.4 million, the prior year, $2.7 million, Net income, the prior year]

In [14]:
# Merge every entity / noun chunk into a single token (modifies `doc` in place).
# `Span.merge()` is deprecated since spaCy v2.1 and removed in v3, so use the
# retokenizer when the installed spaCy provides it. The retokenizer rejects
# overlapping spans, and `spans` can list the same phrase both as an entity and
# as a noun chunk, so overlaps are filtered out first
# (`spacy.util.filter_spans` requires spaCy v2.1.4+).
if hasattr(doc, 'retokenize'):
    with doc.retokenize() as retokenizer:
        for span in spacy.util.filter_spans(spans):
            retokenizer.merge(span)
else:
    # spaCy v2.0.x has no retokenizer: merge each span directly.
    for span in spans:
        span.merge()

In [15]:
# Render again to see the parse now that the spans have been merged.
spacy.displacy.render(doc, jupyter=True)

In [16]:
# Keep only the tokens whose entity label is MONEY.
money_tokens = [x for x in doc if x.ent_type_ == 'MONEY']
money_tokens

[$9.4 million, $2.7 million]

In [17]:
# Inspect the first money token of this sentence.
one_token = money_tokens[0]

In [19]:
# Find the phrase the money token refers to by looking at its dependency label.
if one_token.dep_ in ('attr', 'dobj'):
    # Money is an attribute / direct object: look for an `nsubj` among the
    # left children of its head.
    subject = [w for w in one_token.head.lefts if w.dep_ == 'nsubj']
    if subject:
        # Use the first matching subject.
        subject = subject[0]
        print(subject)
elif one_token.dep_ == 'pobj' and one_token.head.dep_ == 'prep':
    # Money is the object of a preposition: report the token that the
    # preposition itself attaches to.
    print(one_token.head.head)

Net income


In [20]:
# Same lookup as in the previous cell, applied to the second money token.
one_token = money_tokens[1]
if one_token.dep_ in ('attr', 'dobj'):
    # Attribute / direct object: the related phrase is an `nsubj` to the left
    # of the head.
    subject = [w for w in one_token.head.lefts if w.dep_ == 'nsubj']
    if subject:
        subject = subject[0]
        print(subject)
elif one_token.dep_ == 'pobj' and one_token.head.dep_ == 'prep':
    # Object of a preposition: the related phrase is the preposition's head.
    print(one_token.head.head)

the prior year


In [21]:
# Repeat the walkthrough on the second example sentence (rebinds `doc`).
doc = nlp(TEXTS[1])

In [22]:
# Visualise the second sentence inline, before any spans are merged.
spacy.displacy.render(doc, jupyter=True)

In [23]:
# Candidate spans to merge: named entities first, then noun chunks.
# A phrase that is both an entity and a noun chunk is listed twice.
spans = list(doc.ents) + list(doc.noun_chunks)
spans

[twelve billion dollars, 1b, Revenue, twelve billion dollars, a loss]

In [25]:
# Merge every entity / noun chunk into a single token (modifies `doc` in place).
# `Span.merge()` is deprecated since spaCy v2.1 and removed in v3, so use the
# retokenizer when the installed spaCy provides it. The retokenizer rejects
# overlapping spans, and `spans` can list the same phrase both as an entity and
# as a noun chunk, so overlaps are filtered out first
# (`spacy.util.filter_spans` requires spaCy v2.1.4+).
if hasattr(doc, 'retokenize'):
    with doc.retokenize() as retokenizer:
        for span in spacy.util.filter_spans(spans):
            retokenizer.merge(span)
else:
    # spaCy v2.0.x has no retokenizer: merge each span directly.
    for span in spans:
        span.merge()

In [26]:
# Render again to see the parse now that the spans have been merged.
spacy.displacy.render(doc, jupyter=True)

In [27]:
# Keep only the tokens whose entity label is MONEY.
money_tokens = [x for x in doc if x.ent_type_ == 'MONEY']
money_tokens

[twelve billion dollars, 1b]

In [28]:
# Inspect the first money token of the second sentence.
one_token = money_tokens[0]

In [29]:
# Find the phrase the money token refers to by looking at its dependency label.
if one_token.dep_ in ('attr', 'dobj'):
    # Money is an attribute / direct object: look for an `nsubj` among the
    # left children of its head.
    subject = [w for w in one_token.head.lefts if w.dep_ == 'nsubj']
    if subject:
        # Use the first matching subject.
        subject = subject[0]
        print(subject)
elif one_token.dep_ == 'pobj' and one_token.head.dep_ == 'prep':
    # Money is the object of a preposition: report the token that the
    # preposition itself attaches to.
    print(one_token.head.head)

Revenue


In [30]:
# Same lookup as in the previous cell, applied to the second money token.
one_token = money_tokens[1]
if one_token.dep_ in ('attr', 'dobj'):
    # Attribute / direct object: the related phrase is an `nsubj` to the left
    # of the head.
    subject = [w for w in one_token.head.lefts if w.dep_ == 'nsubj']
    if subject:
        subject = subject[0]
        print(subject)
elif one_token.dep_ == 'pobj' and one_token.head.dep_ == 'prep':
    # Object of a preposition: the related phrase is the preposition's head.
    print(one_token.head.head)

a loss


-----------------------

Wrapper function:

In [31]:
def _filter_overlapping_spans(spans):
    """Return a non-overlapping subset of ``spans``, sorted by start index.

    Longer spans win over shorter ones; among spans of equal length the one
    that starts earlier wins, and exact duplicates keep their first occurrence
    (Python's sort is stable, also with ``reverse=True``).
    """
    ranked = sorted(spans, key=lambda s: (s.end - s.start, -s.start), reverse=True)
    kept = []
    claimed = set()  # token indices already covered by a kept span
    for span in ranked:
        # Spans are visited longest-first, so an overlap with a kept span
        # always shows up at one of this span's boundary tokens.
        if span.start in claimed or span.end - 1 in claimed:
            continue
        kept.append(span)
        claimed.update(range(span.start, span.end))
    kept.sort(key=lambda s: s.start)
    return kept


def extract_currency_relations(doc):
    """Pair every MONEY token in ``doc`` with the phrase it refers to.

    Entities and noun chunks are first merged into single tokens, so ``doc``
    is modified in place. The dependency parse is then used to relate each
    money token to a phrase:

    * money is an ``attr`` or ``dobj``: the first ``nsubj`` among the left
      children of its head (skipped if there is none);
    * money is the ``pobj`` of a ``prep``: the token the preposition
      attaches to.

    Parameters
    ----------
    doc : spacy.tokens.Doc
        A parsed document with named entities.

    Returns
    -------
    list of tuple
        ``(phrase_token, money_token)`` pairs in document order.
    """
    # Entities and noun chunks may overlap or coincide; the retokenizer
    # rejects overlapping spans, so keep a non-overlapping subset.
    spans = _filter_overlapping_spans(list(doc.ents) + list(doc.noun_chunks))

    if hasattr(doc, 'retokenize'):
        # spaCy v2.1+: Span.merge() is deprecated there and removed in v3.
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
    else:
        # spaCy v2.0.x has no retokenizer: merge each span directly.
        for span in spans:
            span.merge()

    relations = []
    for money in doc:
        if money.ent_type_ != 'MONEY':
            continue
        if money.dep_ in ('attr', 'dobj'):
            subjects = [w for w in money.head.lefts if w.dep_ == 'nsubj']
            if subjects:
                relations.append((subjects[0], money))
        elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
            relations.append((money.head.head, money))
    return relations