In [1]:
from bs4 import BeautifulSoup
import spacy
from spacy import displacy
import requests
import re
import numpy as np

In [2]:
# Start from an empty English pipeline and attach the sentence-boundary component;
# the cell displays the Sentencizer object that add_pipe returns.
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x2787612bbc0>

In [3]:
# Inspect the pipeline: the summary lists what each component assigns, requires and scores.
nlp.analyze_pipes()

{'summary': {'sentencizer': {'assigns': ['token.is_sent_start', 'doc.sents'],
   'requires': [],
   'scores': ['sents_f', 'sents_p', 'sents_r'],
   'retokenizes': False}},
 'problems': {'sentencizer': []},
 'attrs': {'doc.sents': {'assigns': ['sentencizer'], 'requires': []},
  'token.is_sent_start': {'assigns': ['sentencizer'], 'requires': []}}}

In [4]:
# Load the pretrained small English model for comparison: its summary shows many
# more components than the blank pipeline that only had a sentencizer.
nlp2 = spacy.load("en_core_web_sm")
nlp2.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  'tagger': [],
  'parser': [],
  'attribute_ruler': [],
  'lemmatizer': [],
  'ner': []},
 'att

# Entity Ruler

In [5]:
# Rebind `nlp` to the pretrained pipeline and process a sample sentence with it.
nlp = spacy.load("en_core_web_sm")
text = "Rumbayaya is located in Southeast Asia, based on Mr. Deeds"
doc = nlp(text)

In [6]:
# Print every entity the stock pipeline found, as "<text> <label>".
# A pipeline can be extended with custom components, either rule-based (e.g. regex
# for date formats) or machine-learned (e.g. person names).
# Note that "Rumbayaya" comes out as PERSON although it should be GPE; the entity
# ruler introduced in the following cells is used to correct that.
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

Rumbayaya PERSON
Southeast Asia LOC
Deeds PERSON


In [7]:
# Attach an entity ruler to the pipeline. No position is given, so it is
# appended at the end of the pipeline, i.e. after 'ner'.
# add_pipe raises when a component with the same name is already present, so
# reuse the existing ruler if this cell is re-run on the same kernel.
if nlp.has_pipe('entity_ruler'):
    ruler = nlp.get_pipe('entity_ruler')
else:
    ruler = nlp.add_pipe('entity_ruler')

In [8]:
# Check where the new component landed: the summary lists 'entity_ruler' after 'ner'.
nlp.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ent

In [9]:
# One rule for the ruler: the text "Rumbayaya" should receive the GPE label.
patterns = [dict(label="GPE", pattern="Rumbayaya")]
ruler.add_patterns(patterns)

In [10]:
# Process the text again. The labels are unchanged because the entity ruler sits
# after 'ner' in the pipeline; it has to be placed before 'ner' for the pattern
# to take effect.
doc2 = nlp(text)
for ent in doc2.ents:
    print(f"{ent.text} {ent.label_}")

Rumbayaya PERSON
Southeast Asia LOC
Deeds PERSON


In [13]:
# Build a fresh pipeline in which the entity ruler is inserted ahead of 'ner',
# then register the same patterns on it.
nlp2 = spacy.load("en_core_web_sm")
ruler = nlp2.add_pipe("entity_ruler", before="ner")
ruler.add_patterns(patterns)

In [14]:
# Re-check the labels now that the ruler runs ahead of 'ner'.
doc = nlp2(text)
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

Rumbayaya GPE
Southeast Asia LOC
Deeds PERSON


In [16]:
# Confirm the component order: the summary now lists 'entity_ruler' ahead of 'ner'.
nlp2.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ent

In [17]:
# Extend the rules so that "Mr. Deeds" is labelled as a film. A third pipeline is
# loaded from scratch, again with the entity ruler placed before 'ner'.
nlp3 = spacy.load("en_core_web_sm")
ruler = nlp3.add_pipe("entity_ruler", before="ner")
patterns = [
    dict(label="GPE", pattern="Rumbayaya"),
    dict(label="FILM", pattern="Mr. Deeds"),
]
ruler.add_patterns(patterns)

In [18]:
# Run the extended pipeline and print each entity as "<text> <label>".
doc = nlp3(text)
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

Rumbayaya GPE
Southeast Asia LOC
Mr. Deeds FILM


# Matcher

In [None]:
nlp = spacy.load