<a href="https://colab.research.google.com/github/rahiakela/natural-language-processing-research-and-practice/blob/main/text-similarity-works/09_spacy_phrase_matcher.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

Reference:

[Rule-based entity recognition](https://spacy.io/usage/rule-based-matching#entityruler)

In [None]:
# Install spaCy quietly, then download the small English pipeline
# ("en_core_web_sm") that later cells load with spacy.load().
!pip -q install spacy
!python -m spacy download en_core_web_sm

After the installation finishes, restart the Colab runtime so that the newly installed package and model are picked up.

In [77]:
import pandas as pd
import numpy as np
import re
import time
import os

import pdb

import spacy
from spacy.matcher import PhraseMatcher
from spacy.lang.en import English
from spacy.language import Language
from spacy.tokens import Span
from spacy.tokens import Doc

import nltk

## Efficient phrase matching

In [19]:
# Load the small English pipeline and create a PhraseMatcher that shares its vocabulary.
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)

In [20]:
# Phrases to search for in the text.
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]

# Convert every phrase into a Doc and register them all under a single match key.
patterns = list(map(nlp.make_doc, terms))
matcher.add("TerminologyList", patterns)

In [21]:
# Process a sample sentence and run the matcher over it; each match is later
# unpacked as a (match_id, start, end) triple.
doc = nlp("German Chancellor Angela Merkel and US President Barack Obama converse in the Oval Office inside the White House in Washington, D.C.")
matches = matcher(doc)

In [22]:
# Show the text of each matched span; the match id itself is not needed here.
for _, first, last in matches:
  print(doc[first:last].text)

Angela Merkel
Barack Obama
Washington, D.C.


In [23]:
# Case-insensitive matching: the matcher compares tokens on their LOWER attribute.
nlp = English()
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = list(map(nlp.make_doc, ("Angela Merkel", "Barack Obama")))
matcher.add("Names", patterns)

In [24]:
# Mixed-case input to show that casing is ignored by the LOWER-based matcher.
doc = nlp("angela merkel and us president barack Obama")

for _, start, end in matcher(doc):
  hit = doc[start:end]
  print("Matched based on lowercase token text:", hit)

Matched based on lowercase token text: angela merkel
Matched based on lowercase token text: barack Obama


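References for the custom pipeline component defined below:

- [ValueError: `nlp.add_pipe` now takes the string name of the registered component (Stack Overflow)](https://stackoverflow.com/questions/67906945/valueerror-nlp-add-pipe-now-takes-the-string-name-of-the-registered-component-f)
- [How to modify `Doc` tokens with pipeline components in spaCy (Stack Overflow)](https://stackoverflow.com/questions/57187116/how-to-modify-spacy-tokens-doc-doc-tokens-with-pipeline-components-in-spacy)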

In [108]:
# Exact-text matcher (default attribute) over punctuation-free phrases such as
# "Washington D C"; the input text is normalised by a custom pipeline component.
nlp = English()
matcher = PhraseMatcher(nlp.vocab)
terms = ["Barack Obama", "Angela Merkel", "Washington D C"]

patterns = list(map(nlp.make_doc, terms))
matcher.add("TerminologyList", patterns)

@Language.component("custom_matcher")
def extract_person_orgs(doc):
  """Rebuild ``doc`` without "." and "," so punctuation-free phrases can match.

  Despite its name, this component does not extract anything. It drops tokens
  that are exactly "." or ",", removes every "." from the remaining tokens
  (for example "D." becomes "D"), and returns a new Doc built from the
  cleaned words.

  Args:
    doc: The Doc handed over by the pipeline.

  Returns:
    A new Doc created from ``doc.vocab`` and the cleaned words. Only the words
    are passed on, so token positions may differ from the incoming doc.
  """
  words = []
  for token in doc:
    # Stand-alone periods and commas are removed entirely.
    if token.text in (".", ","):
      continue

    cleaned = token.text.replace(".", "")
    # Tokens made only of periods (e.g. "...") would otherwise turn into
    # empty strings and end up as empty tokens in the new Doc.
    if not cleaned:
      continue
    words.append(cleaned)
  return Doc(doc.vocab, words=words)

nlp.add_pipe("custom_matcher")

<function __main__.extract_person_orgs(doc)>

In [109]:
# The sentence contains stray commas and periods that the pipeline strips
# before the matcher sees the tokens.
doc = nlp("German Chancellor Angela Merkel and US President Barack, Obama converse in the Oval Office inside the White House in Washington, D. C.")
matches = matcher(doc)

for _, first, last in matches:
  print(doc[first:last].text)

Angela Merkel
Barack Obama
Washington D C


In [25]:
# Shape-based matching: patterns and text are compared on the SHAPE attribute,
# so the example addresses only serve as templates.
nlp = English()
matcher = PhraseMatcher(nlp.vocab, attr="SHAPE")
matcher.add("IP", [nlp(example) for example in ("127.0.0.1", "127.127.0.0")])

In [26]:
# Neither address below was given as a pattern; they match on shape alone.
doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.")
for _, start, end in matcher(doc):
  shape_hit = doc[start:end]
  print("Matched based on token shape:", shape_hit)

Matched based on token shape: 192.168.1.1
Matched based on token shape: 192.168.2.1


## Span ruler

In [None]:
# Start from a blank English pipeline and attach a span ruler with two rules:
# one plain-string phrase pattern and one token-level pattern.
nlp = spacy.blank("en")
patterns = [
  dict(label="ORG", pattern="Apple"),
  dict(label="GPE", pattern=[dict(LOWER="san"), dict(LOWER="francisco")]),
]
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(patterns)

In [None]:
# The ruler configured above stores its matches in doc.spans under the "ruler" key.
doc = nlp("Apple is opening its first big office in San Francisco.")
print([(span.text, span.label_) for span in doc.spans["ruler"]])

[('Apple', 'ORG'), ('San Francisco', 'GPE')]


In [None]:
nlp = spacy.load("en_core_web_sm")

# Write matches to doc.ents only (no doc.spans key) and do not overwrite
# entities that are already present.
config = dict(spans_key=None, annotate_ents=True, overwrite=False)

patterns = [dict(label="ORG", pattern="MyCorp Inc.")]
ruler = nlp.add_pipe("span_ruler", config=config)
ruler.add_patterns(patterns)

In [None]:
# Entities are read from doc.ents here, because the ruler was configured with
# annotate_ents=True and spans_key=None.
doc = nlp("MyCorp Inc. is a company in the U.S.")
print([(ent.text, ent.label_) for ent in doc.ents])

[('MyCorp Inc.', 'ORG'), ('U.S.', 'GPE')]


## Expanding named entities

In [29]:
# Baseline: entities found by the stock pipeline, before any custom component is added.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Dr. Alex Smith chaired first board meeting of Acme Corp Inc.")
print([(ent.text, ent.label_) for ent in doc.ents])

[('Alex Smith', 'PERSON'), ('first', 'ORDINAL'), ('Acme Corp Inc.', 'ORG')]


In [32]:
# Load a fresh pipeline object for this example.
nlp = spacy.load("en_core_web_sm")

@Language.component("expand_person_entities")
def expand_person_entities(doc):
    """Extend PERSON entities to the left so they include a preceding title.

    For every PERSON entity that does not start at the first token, the token
    directly before it is checked against a fixed list of titles ("Dr", "Dr.",
    "Mr", "Mr.", "Ms", "Ms."). When it matches, the entity is replaced by a
    span that starts one token earlier and keeps the same label. All other
    entities, including PERSON entities without a title in front of them,
    are kept unchanged.

    Args:
        doc: The Doc whose ``ents`` should be adjusted.

    Returns:
        The same Doc, with ``doc.ents`` overwritten by the adjusted entities.
    """
    titles = ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms.")
    new_ents = []
    for ent in doc.ents:
        # An entity starting at token 0 has no preceding token to inspect.
        if ent.label_ == "PERSON" and ent.start != 0:
            prev_token = doc[ent.start - 1]
            if prev_token.text in titles:
                ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
        # Always keep the entity (expanded or not). The previous version
        # dropped PERSON entities that were not preceded by a title.
        new_ents.append(ent)
    doc.ents = new_ents
    return doc

# Add the component after the named entity recognizer, so that doc.ents is
# already filled in when the component runs.
nlp.add_pipe("expand_person_entities", after="ner")

# Same sentence as in the baseline cell; print the entities after expansion.
doc = nlp("Dr. Alex Smith chaired first board meeting of Acme Corp Inc.")
print([(ent.text, ent.label_) for ent in doc.ents])

[('Dr. Alex Smith', 'PERSON'), ('first', 'ORDINAL'), ('Acme Corp Inc.', 'ORG')]


In [33]:
# Load a fresh pipeline object for the extension-attribute example.
nlp = spacy.load("en_core_web_sm")

def get_person_title(span):
    """Return the title token directly before a PERSON span, or None.

    Only spans labelled "PERSON" that do not start at the first token are
    considered. The text of the preceding token is returned when it is one of
    "Dr", "Dr.", "Mr", "Mr.", "Ms" or "Ms."; in every other case the result
    is None.
    """
    if span.label_ != "PERSON" or span.start == 0:
        return None
    candidate = span.doc[span.start - 1].text
    known_titles = ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms.")
    return candidate if candidate in known_titles else None

# Register the Span extension as 'person_title'.
# force=True allows this cell to be re-run in the same kernel; without it,
# registering an extension name that already exists raises an error.
Span.set_extension("person_title", getter=get_person_title, force=True)

# The title is exposed through the custom attribute; the entity span itself is unchanged.
doc = nlp("Dr Alex Smith chaired first board meeting of Acme Corp Inc.")
print([(ent.text, ent.label_, ent._.person_title) for ent in doc.ents])

[('Alex Smith', 'PERSON', 'Dr'), ('first', 'ORDINAL', None), ('Acme Corp Inc.', 'ORG', None)]


In [34]:
# Baseline for the next example: list the entities found in a "person worked at org" sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Alex Smith worked at Acme Corp Inc.")
print([(ent.text, ent.label_) for ent in doc.ents])

[('Alex Smith', 'PERSON'), ('Acme Corp Inc.', 'ORG')]


In [None]:
# displacy is used at the end of this cell to render the processed doc.
from spacy import displacy

# Load a fresh pipeline object for this example.
nlp = spacy.load("en_core_web_sm")

@Language.component("extract_person_orgs")
def extract_person_orgs(doc):
    """Print person/organisation relations for PERSON entities headed by "work".

    For each PERSON entity whose root token has a head with the lemma "work",
    every "prep" child of that head is inspected. One dict is printed per
    preposition, holding the person entity, the preposition's children tagged
    as ORG, and whether the head's tag is "VBD". The doc is returned unchanged.
    """
    for person in doc.ents:
        if person.label_ != "PERSON":
            continue
        verb = person.root.head
        if verb.lemma_ != "work":
            continue
        for child in verb.children:
            if child.dep_ != "prep":
                continue
            orgs = [tok for tok in child.children if tok.ent_type_ == "ORG"]
            print({'person': person, 'orgs': orgs, 'past': verb.tag_ == "VBD"})
    return doc

# To make the entities easier to work with, we'll merge them into single tokens.
# "merge_entities" is added before the custom component, so it runs first.
nlp.add_pipe("merge_entities")
nlp.add_pipe("extract_person_orgs")

# Processing the text runs the custom component, which prints what it finds.
doc = nlp("Alex Smith worked at Acme Corp Inc.")
# If you're not in a Jupyter / IPython environment, use displacy.serve
displacy.render(doc, options={"fine_grained": True})