# Rule based matching using spacy matcher
    Compared with regular expression the matcher works on doc objects instead of only string
    We can match on tokens and token attributes
    We can write rules that use the models predictions
    Example "duck" (verb) vs "duck" (noun)

In [2]:
import spacy
#import the matcher

from spacy.matcher import Matcher

# Load the model and create nlp object
nlp = spacy.load('en_core_web_sm')

# initialize the matcher with shared vocab
matcher = Matcher(nlp.vocab)

# add pattern
pattern = [{'ORTH':'iPhone'}, {'ORTH':'X'}]
matcher.add('IPhone', [pattern])

#process the text
doc = nlp("New iPhone X release date has been leaked.")

# call the matcher
match = matcher(doc)

for match_id, start, end in match:
    # match_id - hash value of the pattern
    # start - start index of the matched span
    # end - end index of the matched span
    print(doc[start:end])

iPhone X


In [3]:
# Match the lexical attributes

matcher = Matcher(nlp.vocab)

pattern = [{'POS':'NOUN'}]

matcher.add('Noun', [pattern])

doc = nlp('Cat is the cutest animal in the planet.')

match  = matcher(doc)

for match_id, start, end in match:
    print(match_id, doc[start: end].text)

1882071534088494249 animal
1882071534088494249 planet


In [13]:
doc = nlp("FIFA has won the 2018 Fifa World Cup!")
pattern = [{'IS_DIGIT': True}, {'LOWER': 'fifa'}, {'LOWER': 'world'}]
matcher = Matcher(nlp.vocab)
matcher.add('FIFA', [pattern])
match = matcher(doc)

for match_id, start, end in match:
    print(doc[start: end].text)

2018 Fifa World


In [15]:
# match using operators and quantifiers

doc = nlp('I bought a smartphone and now I am buying the apps')
matcher = Matcher(nlp.vocab)

pattern = [{'LEMMA': 'buy'}, {'POS': 'DET', 'OP': '?'}]

matcher.add('LEMMA', [pattern])

match = matcher(doc)

for match_id, start, end in match:
    print(doc[start: end].text)

bought
bought a
buying
buying the


In [27]:
doc = nlp("I downloaded Fortnite on my laptop and can't open the game at all. Help? so when I was downloading\
           Minecraft, I got the Windows version where it is the '.zip' folder and I used the default program to unpack\
           it... do I also need to download Winzip?")

pattern = [{'LEMMA': 'download'}, {'POS': 'PROPN', 'OP':'?'}, {'POS': 'NOUN', 'OP':'?'}]
matcher = Matcher(nlp.vocab)
matcher.add('Test', [pattern])
match = matcher(doc)

for match_id, start, end in match:
    print(doc[start: end].text)

downloaded
downloaded Fortnite
downloading
download
download Winzip


In [7]:
doc = nlp("Features of the app include a beautiful design, smart search, automatic labels and optional voice responses.")
# write a pattern for adjective plus one or two nouns

matcher = Matcher(nlp.vocab)
pattern = [{'POS':'ADJ'}, {'POS': 'NOUN'}, {'POS': 'NOUN', 'OP':'?'}]
matcher.add('Nouns', [pattern])

match = matcher(doc)
for match_id, start, end in match:
    print(doc[start:end])

beautiful design
smart search
automatic labels
optional voice
optional voice responses


### Phrase Matcher

In [8]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

pattern = nlp('Golden Retriver')
matcher.add('Dog', [pattern])

doc = nlp('I have a Golden Retriver')

match = matcher(doc)

for match_id, start, end in match:
    print(doc[start:end])

Golden Retriver


In [13]:
COUNTRIES = ['Afghanistan','Albania','Algeria','Andorra','Angola','Antigua and Barbuda','Argentina','Armenia','Australia','Austria','Azerbaijan','Bahamas','Bahrain','Bangladesh','Barbados','Belarus','Belgium','Belize','Benin','Bhutan','Bolivia','Bosnia and Herzegovina','Botswana','Brazil','Brunei','Bulgaria','Burkina Faso','Burundi',"Côte d'Ivoire",'Cabo Verde','Cambodia','Cameroon','Canada','Central African Republic','Chad','Chile','China','Colombia','Comoros','Congo (Congo-Brazzaville)','Costa Rica','Croatia','Cuba','Cyprus','Czech Republic','Democratic Republic of the Congo','Denmark','Djibouti','Dominica','Dominican Republic','Ecuador','Egypt','El Salvador','Equatorial Guinea','Eritrea','Estonia','Eswatini (fmr. "Swaziland")','Ethiopia','Fiji','Finland','France','Gabon','Gambia','Georgia','Germany','Ghana','Greece','Grenada','Guatemala','Guinea','Guinea-Bissau','Guyana','Haiti','Holy See','Honduras','Hungary','Iceland','India','Indonesia','Iran','Iraq','Ireland','Israel','Italy','Jamaica','Japan','Jordan','Kazakhstan','Kenya','Kiribati','Kuwait','Kyrgyzstan','Laos','Latvia','Lebanon','Lesotho','Liberia','Libya','Liechtenstein','Lithuania','Luxembourg','Madagascar','Malawi','Malaysia','Maldives','Mali','Malta','Marshall Islands','Mauritania','Mauritius','Mexico','Micronesia','Moldova','Monaco','Mongolia','Montenegro','Morocco','Mozambique','Myanmar (formerly Burma)','Namibia','Nauru','Nepal','Netherlands','New Zealand','Nicaragua','Niger','Nigeria','North Korea','North Macedonia','Norway','Oman','Pakistan','Palau','Palestine State','Panama','Papua New Guinea','Paraguay','Peru','Philippines','Poland','Portugal','Qatar','Romania','Russia','Rwanda','Saint Kitts and Nevis','Saint Lucia','Saint Vincent and the Grenadines','Samoa','San Marino','Sao Tome and Principe','Saudi Arabia','Senegal','Serbia','Seychelles','Sierra Leone','Singapore','Slovakia','Slovenia','Solomon Islands','Somalia','South Africa','South Korea','South Sudan','Spain','Sri Lanka','Sudan','Suriname','Sweden','Switzerland','Syria','Tajikistan','Tanzania','Thailand','Timor-Leste','Togo','Tonga','Trinidad and Tobago','Tunisia','Turkey','Turkmenistan','Tuvalu','Uganda','Ukraine','United Arab Emirates','United Kingdom','United States of America','Uruguay','Uzbekistan','Vanuatu','Venezuela','Vietnam','Yemen','Zambia','Zimbabwe']

In [50]:
matcher = PhraseMatcher(nlp.vocab)

doc = nlp("Czech Republic may help Slovaks protect its regime from Russia")
pattern = list(nlp.pipe(COUNTRIES))
matcher.add('Country', pattern)

match = matcher(doc)
print([doc[start:end] for match_id, start, end in match])

[Czech Republic, Russia]


### Comparing Semantic Similarity

In [18]:
nlp = spacy.load('en_core_web_md')

In [23]:
doc_king = nlp('I like King')
doc_queen = nlp("I like Queen")

doc_king.similarity(doc_queen)

0.9297710649238541

In [26]:
doc_men = nlp('I like Men')
doc_women = nlp("I like Women")

doc_men.similarity(doc_women)

0.7687319998878382

In [27]:
doc_men[2].vector

array([ 1.0410e-02, -1.1339e-02,  3.2865e-01, -3.8009e-01, -3.2345e-01,
        5.4006e-02,  5.3922e-01,  3.2480e-01,  5.8872e-01,  3.1140e+00,
       -3.4070e-01, -3.5357e-01,  1.6412e-01, -1.3405e-01,  2.5613e-01,
       -4.5910e-01, -2.9710e-01,  8.7180e-01, -4.5571e-03,  5.2537e-01,
       -1.3515e-01, -6.3228e-01, -1.4012e-01, -1.6667e-01, -1.8224e-01,
        9.4281e-02,  1.7619e-01,  2.7576e-01,  5.1345e-01,  3.4117e-01,
       -3.9954e-01,  6.7538e-01, -1.0417e+00, -5.3799e-02,  2.0235e-01,
       -7.4971e-01,  1.7682e-01, -7.2984e-01, -9.5704e-02,  2.9547e-01,
       -6.5500e-01, -7.5561e-02,  3.5961e-01, -4.1948e-01,  2.3379e-01,
       -3.3119e-01,  8.5873e-02,  1.1059e-01,  2.8848e-01, -2.1663e-02,
       -6.8974e-01, -3.8078e-01,  6.3097e-02, -1.9700e-01,  5.2430e-01,
        4.2529e-01,  1.1217e-01, -2.3269e-01, -4.3879e-01, -1.0308e-01,
       -2.1555e-02, -1.2585e-01, -4.8075e-02,  2.9648e-01, -1.6962e-01,
        4.4210e-01,  1.7999e-01,  1.0938e-01, -2.6039e-01,  2.01

In [39]:
doc1 = nlp('Sun')
doc2 = nlp("notebook")

doc1.similarity(doc2)

0.15040412922563912

In [40]:
# Similarly we can do this for token and span

In [46]:
def get_distance(vector1, vector2):
    return np.linalg.norm(vector1 - vector2)

In [47]:
import numpy as np
king = nlp('KING')
queen = nlp('QUEEN')

get_distance(king[0].vector, queen[0].vector)

5.1864085

In [48]:
man = nlp('MAN')
woman = nlp('WOMAN')

get_distance(man[0].vector, woman[0].vector)

4.8034196

In [51]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

### CUSTOM PIPELINES

In [63]:
from spacy import Language
nlp = spacy.load('en_core_web_sm')
@Language.component('CustomComponent')
def custom_component(doc):
    print('Doc Length: ', len(doc))
    return doc

nlp.add_pipe('CustomComponent', first=True)

print('Pipeline :', nlp.pipe_names)

Pipeline : ['CustomComponent', 'tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [64]:
doc = nlp('Hello World!')

Doc Length:  3


In [83]:
from spacy.tokens.span import Span

nlp = spacy.load('en_core_web_sm')
matcher = PhraseMatcher(nlp.vocab)
pattern = list(nlp.pipe(['cat', 'Golden Retriever']))
matcher.add('Animals', pattern)
# Create more complex component using Phrase Matcher to find animals
@Language.component('AnimalComponent')
def animal_component(doc):
    # create a span for match and assign the Label 'ANIMAL'
    # overwrite doc.ents with matched spans
    doc.ents = [Span(doc, start, end, label='ANIMAL') for match_is, start, end in matcher(doc)]
    return doc

# add the animal component to the pipeline after the ner component

nlp.add_pipe('AnimalComponent', after='ner')

doc = nlp('I have a cat and a Golden Retriever')

#print([ent,text, ent, ent.label_] for ent in doc.ents)

for ent in doc.ents:
    print(ent.text, ent, ent.label_)

cat cat ANIMAL
Golden Retriever Golden Retriever ANIMAL
