<a href="https://colab.research.google.com/github/rahiakela/natural-language-processing-research-and-practice/blob/main/text-similarity-works/09_spacy_phrase_matcher.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

Reference:

[Rule-based entity recognition](https://spacy.io/usage/rule-based-matching#entityruler)

In [None]:
!pip -q install spacy
!python -m spacy download en_core_web_sm

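After the installation finishes, restart the Colab runtime so that the newly installed packages are picked up, then continue with the cells below.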

In [None]:
import pandas as pd
import numpy as np
import re
import time
import os

import pdb

import spacy
from spacy.matcher import PhraseMatcher
from spacy.lang.en import English

import nltk

## Efficient phrase matching

In [None]:
# Load the en_core_web_sm pipeline (downloaded in the setup cell) and create a
# PhraseMatcher that shares the pipeline's vocabulary.
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)

In [None]:
# Phrases to look for, exactly as they should appear in the text.
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]

# Each phrase is turned into a Doc via nlp.make_doc and all of them are
# registered under a single match key.
patterns = list(map(nlp.make_doc, terms))
matcher.add("TerminologyList", patterns)

In [None]:
# Process a sample sentence that contains all three registered phrases.
doc = nlp("German Chancellor Angela Merkel and US President Barack Obama converse in the Oval Office inside the White House in Washington, D.C.")
# Calling the matcher on the Doc yields (match_id, start, end) triples; the
# start/end values are token indices used to slice `doc` in the next cell.
matches = matcher(doc)

In [None]:
# Print the text of every matched span; the match id is not needed here.
for _match_id, first, last in matches:
  print(doc[first:last].text)

Angela Merkel
Barack Obama
Washington, D.C.


In [None]:
# Case-insensitive matching: the matcher compares the LOWER attribute of each
# token instead of the verbatim text.
nlp = English()
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

names = ["Angela Merkel", "Barack Obama"]
patterns = list(map(nlp.make_doc, names))
matcher.add("Names", patterns)

In [None]:
# The sample text uses different casing than the registered names.
doc = nlp("angela merkel and us president barack Obama")

for _match_id, first, last in matcher(doc):
  matched_span = doc[first:last]
  print("Matched based on lowercase token text:", matched_span)

Matched based on lowercase token text: angela merkel
Matched based on lowercase token text: barack Obama


In [None]:
# Shape-based matching: the matcher compares the SHAPE attribute of each token,
# so the example addresses below serve as templates rather than literal values.
nlp = English()
matcher = PhraseMatcher(nlp.vocab, attr="SHAPE")

ip_examples = ["127.0.0.1", "127.127.0.0"]
matcher.add("IP", [nlp(example) for example in ip_examples])

In [None]:
# Neither address in this sentence was registered literally with the matcher.
doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.")

for _match_id, first, last in matcher(doc):
  matched_span = doc[first:last]
  print("Matched based on token shape:", matched_span)

Matched based on token shape: 192.168.1.1
Matched based on token shape: 192.168.2.1


## Span ruler

In [None]:
# Blank English pipeline whose only added component is the span ruler.
nlp = spacy.blank("en")
ruler = nlp.add_pipe("span_ruler")

# One phrase pattern (a plain string) and one token pattern (a list of
# per-token attribute dicts, here matching on lowercase text).
org_pattern = {"label": "ORG", "pattern": "Apple"}
gpe_pattern = {
  "label": "GPE",
  "pattern": [{"LOWER": word} for word in ("san", "francisco")],
}

patterns = [org_pattern, gpe_pattern]
ruler.add_patterns(patterns)

In [None]:
doc = nlp("Apple is opening its first big office in San Francisco.")

# Read the spans from the "ruler" span group and show each with its label.
labeled_spans = [(span.text, span.label_) for span in doc.spans["ruler"]]
print(labeled_spans)

[('Apple', 'ORG'), ('San Francisco', 'GPE')]


In [None]:
nlp = spacy.load("en_core_web_sm")

# Write matches to doc.ents only (no span group) and do not overwrite.
config = {
  "spans_key": None,
  "annotate_ents": True,
  "overwrite": False,
}
ruler = nlp.add_pipe("span_ruler", config=config)

# A single phrase pattern labelled as an organisation.
patterns = [
  {"label": "ORG", "pattern": "MyCorp Inc."},
]
ruler.add_patterns(patterns)

In [None]:
doc = nlp("MyCorp Inc. is a company in the U.S.")

# List every entity on the Doc together with its label.
labeled_ents = [(ent.text, ent.label_) for ent in doc.ents]
print(labeled_ents)

[('MyCorp Inc.', 'ORG'), ('U.S.', 'GPE')]
