In [1]:
# Import spaCy
import spacy

# Load the small English pipeline. Calling `nlp` on a string returns the
# processed document that the cells below hand to the Matcher.
nlp = spacy.load("en_core_web_sm")

# Rule-based matching

***Each dictionary inside a pattern describes one token***

Match "iPhone X" and all verbs.

In [2]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")

text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"

# Process the text
doc = nlp(text)

# The matcher is built on the pipeline's vocabulary
matcher = Matcher(nlp.vocab)

# Pattern 1: two consecutive tokens whose exact text is "iPhone" and "X"
pattern1 = [{'TEXT':'iPhone'},{'TEXT':'X'}]
matcher.add('iphonx_pattern', [pattern1])

# Pattern 2: any single token whose part-of-speech tag is VERB
pattern2 = [{'POS':'VERB'}]
matcher.add('verb_pattern', [pattern2])

matches = matcher(doc)

print('number of matches: ',len(matches))

# Each match is a (match_id, start, end) tuple; doc[start:end] is the matched
# span. The loop variable is called `match_id` rather than `id` so that the
# builtin id() is not shadowed in the notebook's shared namespace.
# nlp.vocab.strings[match_id] would give back the pattern name as a string.
for match_id, start_index, end_index in matches:
    print(match_id, start_index, end_index,' -> ', doc[start_index:end_index])

number of matches:  4
1598491564677789804 0 1  ->  Upcoming
6584259252511850000 1 3  ->  iPhone X
1598491564677789804 5 6  ->  leaked
1598491564677789804 8 9  ->  reveals


In [3]:
doc = nlp("2018 FIFA World Cup: France won!")

# Five consecutive tokens: a digit-only token, then "fifa", "world", "cup"
# compared on their lowercase form, then a punctuation token
pattern = [
    {"IS_DIGIT": True},
    {"LOWER": "fifa"},
    {"LOWER": "world"},
    {"LOWER": "cup"},
    {"IS_PUNCT": True}
]

# A new Matcher, so only this pattern is registered
matcher = Matcher(nlp.vocab)
matcher.add('fifa_world_cup',[pattern])
matches = matcher(doc)

print('number of matches: ',len(matches))

# Each match is a (match_id, start, end) tuple; doc[start:end] is the matched
# span. `match_id` (not `id`) keeps the builtin id() usable in later cells.
for match_id, start_index, end_index in matches:
    print(match_id, start_index, end_index,' -> ', doc[start_index:end_index])

number of matches:  1
11837971121878756899 0 5  ->  2018 FIFA World Cup:


# Pattern matcher: operators and quantifiers

***Besides token attributes, a token pattern can carry an `OP` key that controls how many times it may match***

In [4]:
# A form of "buy", optionally followed by a determiner, followed by a noun
pattern = [
    {"LEMMA": "buy"},
    {"POS": "DET", "OP": "?"},  # optional: match 0 or 1 times
    {"POS": "NOUN"}
]
doc = nlp("I bought a smartphone. Now I'm buying apps.")

# A new Matcher, so only this pattern is registered
matcher = Matcher(nlp.vocab)
matcher.add('buy_sth',[pattern])
matches = matcher(doc)

print('number of matches: ',len(matches))

# Each match is a (match_id, start, end) tuple; doc[start:end] is the matched
# span. `match_id` (not `id`) keeps the builtin id() usable in later cells.
for match_id, start_index, end_index in matches:
    print(match_id, start_index, end_index,' -> ', doc[start_index:end_index])


number of matches:  2
16299021794684283811 1 4  ->  bought a smartphone
16299021794684283811 8 10  ->  buying apps


{"OP": "!"}	Negation: match 0 times

{"OP": "?"}	Optional: match 0 or 1 times

{"OP": "+"}	Match 1 or more times

{"OP": "*"}	Match 0 or more times

### Example:

Write one pattern that matches adjectives ("ADJ") followed by one or two "NOUN"s (one noun and one optional noun).

In [5]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")

# Parse the example sentence into a Doc
doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)

# One adjective, one mandatory noun, then a second noun that may be absent
pattern = [
    {"POS": "ADJ"},
    {"POS": "NOUN"},
    {"POS": "NOUN", "OP": "?"},
]

# Register the pattern under its name and run the matcher over the doc
matcher = Matcher(nlp.vocab)
matcher.add("ADJ_NOUN_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))

# Show the text of every matched span
for match_id, start, end in matches:
    span = doc[start:end]
    print("Match found:", span.text)

Total matches found: 5
Match found: beautiful design
Match found: smart search
Match found: automatic labels
Match found: optional voice
Match found: optional voice responses
