<a href="https://colab.research.google.com/github/seanreed1111/colab-demos/blob/master/rules_based_spacy_with_entity_ruler_and_spans.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- https://github.com/AnushaDeviR/nlp_spacy_basics/tree/main

# Rules-based Spacy

### Using SpaCy's EntityRuler

- 2 different ways to add custom features to language-based pipelines:
  1. rules-based
  1. machine learning based approach
  
- Rule-based approach is taken when a set of rules can be generated using a list of known things or rules generated from regex or linguistic features (used to recoginize dates)
- ML-based approach is taken when we don't know the rules or are complicated (used in entity recoginition of names)

In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')
# our code should be able to extract `harry potter` as a movie and 'Azkaban' as a place
text = 'Azkaban was referenced in Harry Potter.'

In [3]:
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

Azkaban ORG
Harry Potter PERSON


In [4]:
ruler = nlp.add_pipe('entity_ruler')

In [5]:
nlp.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ent

In [7]:
# adding patterns in the pipelines
patterns = [
    {'label': 'GPE', 'pattern': 'Azkaban'}
]
ruler.add_patterns(patterns)

In [8]:
doc2 = nlp(text)
for ent in doc2.ents:
    print(ent.text, ent.label_) # IT DIDN'T WORK!!!

Azkaban ORG
Harry Potter PERSON



The `ner` should be after the `entity_ruler` pipe in order for `Azkaban` to be categorized as `GPE`.

In [9]:
nlp2 = spacy.load('en_core_web_sm')

In [10]:
# to add the entity_ruler before ner, the parameter `before` is sent
ruler = nlp2.add_pipe('entity_ruler', before='ner')
patterns = [
    {'label': 'GPE', 'pattern': 'Azkaban'}
]
ruler.add_patterns(patterns)

In [12]:
doc = nlp2(text)

for ent in doc.ents:
    print(ent.text, ent.label_) # works!!

Azkaban GPE
Harry Potter PERSON


In [13]:
nlp2.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ent

In [14]:
from spacy import displacy
displacy.render(doc, style='ent', jupyter=True)

In [15]:
displacy.render(doc, style='dep', jupyter=True)

# adding `film` as a custom label

In [24]:
# adding `film` as a custom label

nlp3 = spacy.load('en_core_web_sm')
ruler = nlp3.add_pipe('entity_ruler', before = 'ner')
nlp3.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'entity_ruler',
 'ner']

In [25]:
nlp3.disable_pipe("lemmatizer")
nlp3.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'entity_ruler', 'ner']

In [None]:
patterns = [
    {'label': 'GPE', 'pattern': 'Azkaban'},
    {'label': 'FILM', 'pattern': 'Harry Potter'}
]

In [26]:
ruler.add_patterns(patterns)
doc = nlp3(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

Azkaban GPE
Harry Potter PERSON


`NOTE`: But identifying `Harry Potter` as a film could cause a clash when a person is actually named as `Harry Potter`, due to our model recoginizing it as a film. This is called a `toponym`. A toponym resolution refers to when a word can have multiple label that are dependent upon context.

# Spacy with RegEx for multi-word tokens

In [32]:
import re
from spacy.tokens import Span

text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."

pattern = f"Paul [A-Z]\w+"

matches = re.finditer(pattern, text)
for match in matches:
    print(match)

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


## If you want to add multi-word tokens, you have to search through the do for ALL of them if you use Span class to add them to  `doc.ents`

In [39]:
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."
pattern = f"Paul [A-Z]\w+"

nlp = spacy.blank('en')
doc = nlp(text)
original_ents = list(doc.ents)
print("tokens>", [tok for tok in doc])
print("original entities>", [ent for ent in original_ents])
mwt_ents = []

for match in re.finditer(pattern, doc.text):
    start, end = match.span()
    span = doc.char_span(start, end)

    if span is not None:
      mwt_ents.append((span.start, span.end, span.text))

print("new entities>", [ent for ent in mwt_ents])

tokens> [Paul, Newman, was, an, American, actor, ,, but, Paul, Hollywood, is, a, British, TV, Host, ., The, name, Paul, is, quite, common, .]
original entities> []
new entities> [(0, 2, 'Paul Newman'), (8, 10, 'Paul Hollywood')]


In [40]:
for ent in mwt_ents:
    start, end, _ = ent
    person_ent = Span(doc, start, end, label = 'PERSON')
    original_ents.append(person_ent)

doc.ents = original_ents

for ent in doc.ents:
    print(ent.text, ent.label_)

Paul Newman PERSON
Paul Hollywood PERSON


## Do all this in a custom pipeline

In [42]:
# create a custom pipe
from spacy.language import Language
import re

@Language.component('paul_ner')
def paul_ner(doc):
    pattern = f"Paul [A-Z]\w+"
    original_ents = list(doc.ents)
    mwt_ents = []

    for match in re.finditer(pattern, doc.text):
        start, end = match.span()
        span = doc.char_span(start, end)

        if span is not None:
            mwt_ents.append((span.start, span.end, span.text))

    for ent in mwt_ents:
        start, end, _ = ent
        person_ent = Span(doc, start, end, label = 'PERSON')
        original_ents.append(person_ent)

    doc.ents = original_ents

    return(doc)

In [44]:
nlp2 = spacy.blank('en')
nlp2.add_pipe('paul_ner')
doc2 = nlp2(text)
for ent in doc2.ents:
    print(ent.text, ent.label_)

Paul Newman PERSON
Paul Hollywood PERSON
