In [34]:
import spacy
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher

In [35]:
# Use the rule-based Matcher to detect token patterns.
# It is constructed from the vocabulary of the loaded pipeline.

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
# Bare last expression: displays the matcher's repr as the cell output
matcher

<spacy.matcher.matcher.Matcher at 0x2128368d1c0>

In [36]:
# Pattern: a verb form of the lemma 'love' followed by the word 'cats' in any casing
pattern1 = [
    {'LEMMA': 'love', 'POS': 'VERB'},
    {'LOWER': 'cats'}
]
# Patterns are passed as a list: the older add(key, None, pattern) call form
# is not accepted by spaCy v3, while add(key, [pattern]) works from v2.2.2 on.
matcher.add('LOVE_CATS', [pattern1])

In [37]:
# Pattern: one or more instances of 'very' followed by 'happy'
# (TEXT compares the exact token text, so this is case-sensitive)
pattern2 = [
    {'TEXT': 'very', 'OP': '+'},
    {'TEXT': 'happy'}
]
# Patterns are passed as a list: the older add(key, None, pattern) call form
# is not accepted by spaCy v3, while add(key, [pattern]) works from v2.2.2 on.
# The key spelling is left unchanged so the match IDs stay the same.
matcher.add('VERRY_HAPPY', [pattern2])

In [38]:
# Process a sample sentence and run every registered pattern over it.
# Each result is a (match_id, start, end) triple of token indices.
doc = nlp("I love cats and I'm very happy")
matches = matcher(doc)
matches

[(9137535031263442622, 1, 3), (2086246540823757615, 6, 8)]

In [39]:
# Show the slice of the doc covered by each match
for _key_hash, span_start, span_end in matches:
    matched_span = doc[span_start:span_end]
    print(matched_span)

love cats
very happy


In [40]:
# Case-insensitive match pattern for 'golden retriever'
pattern = [
    {'LOWER': 'golden'},
    {'LOWER': 'retriever'}
]
# Patterns are passed as a list: the older add(key, None, pattern) call form
# is not accepted by spaCy v3, while add(key, [pattern]) works from v2.2.2 on.
matcher.add('GOLDEN_RETRIEVER', [pattern])

doc = nlp('I have a Golden Retriever.')
matches = matcher(doc)
matches

[(15887489236585457779, 3, 5)]

In [41]:
# Find out more about the span obtained from each match

for match_id, start, end in matches:
    span = doc[start:end]
    print('Matched span:', span.text)

    # Root token of the span,
    # i.e. with more than one token, the token that decides the category of the phrase
    print('Root token:', span.root.text)

    # Head of the root token,
    # i.e. the syntactic parent that governs the phrase
    print('Root hread token:', span.root.head.text)

    # PoS information of the token just before the phrase.
    # Guarded: with start == 0, doc[start - 1] would wrap around to the LAST token.
    if start > 0:
        previous_token = doc[start - 1]
        print('Previous token:', previous_token.text, 'PoS:', previous_token.pos_)
    else:
        print('Previous token: (none - the span starts the doc)')

    # PoS information of the token just after the phrase.
    # Guarded: with end == len(doc), doc[end] would raise IndexError.
    if end < len(doc):
        next_token = doc[end]
        print('Next token:', next_token.text, 'PoS:', next_token.pos_)
    else:
        print('Next token: (none - the span ends the doc)')

Matched span: Golden Retriever
Root token: Retriever
Root hread token: have
Previous token: a PoS: DET
Next token: . PoS: PUNCT


In [42]:
# PhraseMatcher is a more efficient tool for finding sequences of words, especially for matching large word lists
# It is faster and has direct access to tokens

# Instantiate (note: this rebinds `matcher`, replacing the rule-based Matcher created earlier)
matcher = PhraseMatcher(nlp.vocab)
matcher

<spacy.matcher.phrasematcher.PhraseMatcher at 0x212857589e0>

In [47]:
# Instead of a list of dictionaries, pass a Doc object as the pattern
pattern = nlp('Golden Retriever')
# Patterns are passed as a list: the older add(key, None, pattern) call form
# is not accepted by spaCy v3, while add(key, [pattern]) works from v2.2.2 on.
matcher.add('DOG', [pattern])

doc = nlp('I hava a Golden Retriever')
matches = matcher(doc)
matches

[(2951553348639939143, 3, 5)]

In [48]:
# Print the span found for every phrase match
for _key_hash, first, last in matches:
    print('Matched span:', doc[first:last])

Matched span: Golden Retriever


In [50]:
# Start again with a fresh rule-based Matcher (rebinds `matcher`)
matcher = Matcher(nlp.vocab)

# 'amazon' in any casing, followed by a title-cased proper noun
pattern1 = [
    {'LOWER': 'amazon'},
    {'IS_TITLE': True, 'POS': 'PROPN'}
]

# 'ad', a hyphen and 'free' as three consecutive tokens ('ad' and 'free' in any casing)
pattern2 = [
    {'LOWER': 'ad'},
    {'TEXT': '-'},
    {'LOWER': 'free'}
]

doc = nlp(
    "Twitch Prime, the perks program for Amazon Prime members offering free "
    "loot, games and other benefits, is ditching one of its best features: "
    "ad-free viewing. According to an email sent out to Amazon Prime members "
    "today, ad-free viewing will no longer be included as a part of Twitch "
    "Prime for new members, beginning on September 14. However, members with "
    "existing annual subscriptions will be able to continue to enjoy ad-free "
    "viewing until their subscription comes up for renewal. Those with "
    "monthly subscriptions will have access to ad-free viewing until October 15."
)

# Patterns are passed as a list: the older add(key, None, pattern) call form
# is not accepted by spaCy v3, while add(key, [pattern]) works from v2.2.2 on.
matcher.add('AMAZON_PROPN', [pattern1])
matcher.add('AD_FREE', [pattern2])

matches = matcher(doc)
matches

[(3859178723802017889, 7, 9),
 (6462165356620187099, 27, 30),
 (3859178723802017889, 39, 41),
 (6462165356620187099, 44, 47),
 (6462165356620187099, 82, 85),
 (6462165356620187099, 102, 105)]

In [51]:
# Print the matched text for each hit, in the order the matcher returned them
for _key_hash, span_start, span_end in matches:
    hit = doc[span_start:span_end]
    print(hit)

Amazon Prime
ad-free
Amazon Prime
ad-free
ad-free
ad-free
