In [2]:
import spacy

In [3]:
# Import Matcher module
from spacy.matcher import Matcher

In [4]:
# Load the 'en_core_web_sm' pipeline; `nlp` is called on raw text in later cells
# to produce the docs that the matcher is run against, and its vocab is shared
# with the Matcher.
nlp = spacy.load('en_core_web_sm')

In [6]:
# Display the pipeline's vocabulary object; this same object is handed to the
# Matcher constructor so that the matcher and the pipeline share one vocab
nlp.vocab

<spacy.vocab.Vocab at 0x2c6980d41c0>

In [7]:
# Initialize a single Matcher instance bound to the pipeline's vocab.
# Every matcher.add(...) call later in the notebook registers on this same
# instance, so patterns accumulate from cell to cell.
matcher = Matcher(nlp.vocab)

In [32]:
# Token pattern for the text 'iPhone X': one dict per token, in order.
# 'TEXT' compares the token text verbatim, so the match is case sensitive.
pattern = [
    {'TEXT': 'iPhone'},
    {'TEXT': 'X'},
]
pattern

[{'TEXT': 'iPhone'}, {'TEXT': 'X'}]

In [33]:
# Register the pattern under the unique id 'IPHONE_PATTERN' (no on_match callback).
# The patterns are passed as a list: this form is accepted from spaCy 2.2.2 onward,
# whereas the older matcher.add(key, None, pattern) signature is rejected by spaCy 3.
matcher.add('IPHONE_PATTERN', [pattern])

In [34]:
# Run the pipeline on a sample headline that contains the text 'iPhone X'
doc = nlp('Upcoming iPhone X release date leaked')

In [35]:
# Call the matcher on the doc; the result is unpacked in the next cell as
# (match_id, start_index, end_index) triples
matches = matcher(doc)

In [36]:
# Walk through the matches, reporting the id and token offsets of each one
# followed by the slice of the doc that was matched.
for match_id, start_index, end_index in matches:
    matched_span = doc[start_index:end_index]
    print('Match Id:', match_id, 'Start index:', start_index, 'End index:', end_index)
    print('Matched span:', matched_span)

Match Id: 9528407286733565721 Start index: 1 End index: 3
Matched span: iPhone X


In [37]:
# 5-token pattern:
#   1. a token made of digits,
#   2-4. the words 'fifa', 'world', 'cup' compared on their lowercase form
#        (i.e. case-insensitively),
#   5. any punctuation token.
pattern = [
    {'IS_DIGIT': True},
    *({'LOWER': word} for word in ('fifa', 'world', 'cup')),
    {'IS_PUNCT': True},
]

In [38]:
# Sample sentence for the FIFA pattern: a number, the three words, then a colon
doc = nlp('2018 FIFA World Cup: France won!')

In [40]:
# Register the 5-token pattern under the id 'FIFA'.
# The patterns are passed as a list (accepted from spaCy 2.2.2 onward); the older
# matcher.add(key, None, pattern) signature is rejected by spaCy 3.
matcher.add('FIFA', [pattern])

# The matcher is shared across cells, so every rule added so far is applied here.
for match_id, start_index, end_index in matcher(doc):
    print(doc[start_index:end_index])

2018 FIFA World Cup:


In [42]:
# 2-token pattern: a token whose lemma is 'love' and whose PoS is VERB,
# followed by a token whose PoS is NOUN
pattern = [
    {'LEMMA': 'love', 'POS': 'VERB'},
    {'POS': 'NOUN'}
]

# The patterns are passed as a list (accepted from spaCy 2.2.2 onward); the older
# matcher.add(key, None, pattern) signature is rejected by spaCy 3.
matcher.add('LOVE_VERB_NOUN', [pattern])

doc = nlp('I used to love dogs but now I have started  loving cats more.')

# Print the text of every span matched by the rules registered so far
for match_id, start_index, end_index in matcher(doc):
    print(doc[start_index:end_index])

love dogs
loving cats


In [43]:
# 3-token pattern: some form of the lemma 'buy', followed by an optional
# determiner (such as 'the' or 'a'), followed by a noun
pattern = [
    {'LEMMA': 'buy'},
    {'POS': 'DET', 'OP': '?'},         # OP '?' makes this token optional (0 or 1 times)
    {'POS': 'NOUN'}
]

# The patterns are passed as a list (accepted from spaCy 2.2.2 onward); the older
# matcher.add(key, None, pattern) signature is rejected by spaCy 3.
matcher.add('BUY_THE_NOUN', [pattern])

doc = nlp('I bought a smartphone. Now I\'m buying apps.')

# Print the text of every span matched by the rules registered so far
for match_id, start_index, end_index in matcher(doc):
    print(doc[start_index:end_index])

bought a smartphone
buying apps


##### Four values taken by OP:

* !   Negate the token pattern: it must match exactly 0 times
* ?   Optional: match 0 or 1 times
* \+   Match 1 or more times
* \*   Match 0 or more times

In [44]:
# Pattern to match iOS versions: the exact text 'iOS' followed by a token
# consisting of digits. A bare 'iOS' with no number after it is not matched.

pattern = [
    {'TEXT': 'iOS'},
    {'IS_DIGIT': True}
]

# The patterns are passed as a list (accepted from spaCy 2.2.2 onward); the older
# matcher.add(key, None, pattern) signature is rejected by spaCy 3.
matcher.add('IOS_VERSION', [pattern])

doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)

# Print the text of every span matched by the rules registered so far
for match_id, start_index, end_index in matcher(doc):
    print(doc[start_index:end_index])

iOS 7
iOS 11
iOS 10


In [47]:
# Pattern to match some form of the lemma 'download' followed by a proper noun.
# LEMMA is compared as an exact, case-sensitive string, so it has to be written
# the way the lemma actually appears: all lowercase.

pattern = [
    {'LEMMA': 'download'},
    {'POS': 'PROPN'}
]

# The patterns are passed as a list (accepted from spaCy 2.2.2 onward); the older
# matcher.add(key, None, pattern) signature is rejected by spaCy 3.
matcher.add('DOWNLOAD_PROPN', [pattern])

doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

# Print the text of every span matched by the rules registered so far
for match_id, start_index, end_index in matcher(doc):
    print(doc[start_index:end_index])

downloaded Fortnite
downloading Minecraft


In [48]:
# List each token next to its coarse part-of-speech tag, to see how the
# tagger labelled the words the pattern was meant to catch.
for token in doc:
    print(f'{token.text} {token.pos_}')

i PRON
downloaded VERB
Fortnite PROPN
on ADP
my DET
laptop NOUN
and CCONJ
ca VERB
n't PART
open VERB
the DET
game NOUN
at ADV
all ADV
. PUNCT
Help VERB
? PUNCT
so ADV
when ADV
I PRON
was AUX
downloading VERB
Minecraft PROPN
, PUNCT
I PRON
got VERB
the DET
Windows PROPN
version NOUN
where ADV
it PRON
is AUX
the DET
' PUNCT
.zip PUNCT
' PUNCT
folder ADV
and CCONJ
I PRON
used VERB
the DET
default NOUN
program NOUN
to PART
unpack VERB
it PRON
... PUNCT
do AUX
I PRON
also ADV
need VERB
to PART
download VERB
Winzip NOUN
? PUNCT


'Winzip' is tagged as a common noun (NOUN) rather than a proper noun (PROPN), so 'download Winzip' is not caught by the pattern.

In [49]:
# Pattern that matches an adjective followed by one or two nouns,
# i.e. the second noun is optional. When two nouns follow, both the
# one-noun and the two-noun spans are reported.

pattern = [
    {'POS': 'ADJ'},
    {'POS': 'NOUN'},
    {'POS': 'NOUN', 'OP': '?'}
]

# The patterns are passed as a list (accepted from spaCy 2.2.2 onward); the older
# matcher.add(key, None, pattern) signature is rejected by spaCy 3.
matcher.add('ADJ_NOUN_OPT_NOUN', [pattern])

doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)

# Print the text of every span matched by the rules registered so far
for match_id, start_index, end_index in matcher(doc):
    print(doc[start_index:end_index])

beautiful design
smart search
automatic labels
optional voice
optional voice responses
