In [10]:
import spacy

In [11]:
# Load the small English pipeline; `nlp` is used below to build Docs and
# to supply the shared vocabulary for the matchers.
nlp = spacy.load("en_core_web_sm")

In [12]:
from spacy.matcher import Matcher

In [13]:
# Create a token-pattern Matcher that shares the vocabulary of the loaded pipeline.
matcher = Matcher(nlp.vocab)

In [5]:
# Three token patterns that together cover the spellings of "solar power".
# 'LOWER' compares against the lowercased token text, so casing is ignored.

# Single token: "SolarPower", "solarpower", "SOLARPOWER"
pattern1 = [
    {"LOWER": "solarpower"},
]

# Three tokens with one punctuation token in the middle: "Solar-power"
pattern2 = [
    {"LOWER": "solar"},
    {"IS_PUNCT": True},
    {"LOWER": "power"},
]

# Two separate words: "Solar power"
pattern3 = [
    {"LOWER": "solar"},
    {"LOWER": "power"},
]

In [8]:
# Register all three token patterns under the single key 'Solarpower'.
# NOTE(review): this is the spaCy v2 call shape add(key, on_match, *patterns);
# newer spaCy releases expect add(key, [patterns]) — confirm the installed version.
matcher.add(
    'Solarpower',
    None,  # on_match callback slot: no callback is used
    pattern1,
    pattern2,
    pattern3,
)

In [9]:
# Sample text containing the two-word, one-word and hyphenated spellings.
doc = nlp("The Solar Power industry continues to grow solarpower increases. Solar-power is amazing")

In [11]:
# Run the matcher over `doc`; the result is iterated below as
# (match_id, start, end) triples.
found_matches = matcher(doc)

In [12]:
# Show the raw match triples (hash id, start token index, end token index).
print(found_matches)

[(6544436658971563323, 1, 3), (6544436658971563323, 7, 8), (6544436658971563323, 10, 13)]


In [13]:
# Report every hit: numeric id, rule name, token bounds and the matched text.
for match_id, start, end in found_matches:
    span = doc[start:end]                    # tokens covered by this match
    string_id = nlp.vocab.strings[match_id]  # rule name looked up from the hash id
    print(*(match_id, string_id, start, end, span.text))

6544436658971563323 Solarpower 1 3 Solar Power
6544436658971563323 Solarpower 7 8 solarpower
6544436658971563323 Solarpower 10 13 Solar-power


In [17]:
# Drop the rule registered under the key 'Solarpower' from the matcher.
matcher.remove('Solarpower')

In [18]:
# Bare expression: the notebook displays the matcher object's repr.
matcher

<spacy.matcher.Matcher at 0x1130556c8>

In [6]:
# Redefine the patterns, this time with a quantifier on the punctuation token.

# Single token, any casing.
pattern1 = [
    {'LOWER': 'solarpower'},
]

# "solar" ... "power" with the punctuation token governed by 'OP': '*'.
pattern2 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'power'},
]

In [21]:
# Second rule set, registered under the key 'SolarPower'.
# Both patterns are defined here in full so that this cell does not depend on
# `pattern2` having been (re)assigned by some other cell beforehand.

# "SolarPower", "solarpower", ... (single token, any casing)
pattern1 = [{"LOWER": 'solarpower'}]

# "solar" followed by "power", with the punctuation token quantified by
# 'OP': '*' (zero or more punctuation tokens in between).
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP': '*'}, {'LOWER': 'power'}]

matcher.add('SolarPower', None, pattern1, pattern2)

In [22]:
# Test sentence with a double-hyphen spelling and the single-word spelling.
doc2 = nlp("Solar--power is solarpower yay!")

In [23]:
# Run the matcher over `doc2`; this rebinds `found_matches` to the new result.
found_matches = matcher(doc2)

In [24]:
# Show the raw match triples found in `doc2`.
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


In [15]:
from spacy.matcher import PhraseMatcher

In [16]:
# NOTE: this rebinds `matcher` — from here on the name refers to a
# PhraseMatcher, no longer the token-pattern Matcher created earlier.
matcher = PhraseMatcher(nlp.vocab)

In [17]:
# Read the whole article first, then parse it once the file is closed.
# NOTE(review): 'unicode_escape' is an unusual codec for reading prose;
# presumably chosen to sidestep a decoding error — confirm the file's real encoding.
with open('../TextFiles/reaganomics.txt', encoding='unicode_escape') as f:
    raw_text = f.read()

doc3 = nlp(raw_text)

In [18]:
# First, create a list of match phrases:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

# Next, convert each phrase to a Doc object.
# nlp.make_doc only tokenizes the text; the PhraseMatcher here was created with
# default settings and does not need the rest of the pipeline (tagger, parser,
# ...), so this avoids running it for every pattern.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

# Register all phrase Docs under one key; the asterisk unpacks the list into
# separate positional arguments:
matcher.add('EconMatcher', None, *phrase_patterns)

# Build a list of matches:
found_matches = matcher(doc3)

In [19]:
# Display the list of pattern objects built from the phrases.
phrase_patterns

[voodoo economics,
 supply-side economics,
 trickle-down economics,
 free-market economics]

In [20]:
# Check the type of a single pattern object.
type(phrase_patterns[0])

spacy.tokens.doc.Doc

In [21]:
# Run the phrase matcher over `doc3` and rebind `found_matches` to the result.
found_matches = matcher(doc3)

In [22]:
# Display the raw match triples (hash id, start token index, end token index).
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2984, 2988)]

In [51]:
# List each phrase hit in doc3: hash id, rule name, token bounds, matched text.
for match_id, start, end in found_matches:
    # Resolve the rule name and slice out the matched tokens in one step.
    string_id, span = nlp.vocab.strings[match_id], doc3[start:end]
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 49 53 trickle-down economics
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
3680293220734633682 EconMatcher 673 677 supply-side economics
3680293220734633682 EconMatcher 2984 2988 trickle-down economics


In [52]:
# Show each match together with its surrounding context:
# up to 4 tokens before the match and 5 tokens after it.

for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    # Clamp the window start at 0: a negative slice start would count from
    # the end of the Doc and yield the wrong (typically empty) window for a
    # match that begins within the first four tokens.
    context_start = max(start - 4, 0)
    span = doc3[context_start:end + 5]       # matched span plus surrounding tokens
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 are commonly associated with supply-side economics, referred to as trickle
3680293220734633682 EconMatcher 49 53 , referred to as trickle-down economics or voodoo economics by political
3680293220734633682 EconMatcher 54 56 -down economics or voodoo economics by political opponents, and
3680293220734633682 EconMatcher 61 65 political opponents, and free-market economics by political advocates.


3680293220734633682 EconMatcher 673 677 a following from the supply-side economics movement, which formed in
3680293220734633682 EconMatcher 2984 2988 widely known as "trickle-down economics", due to the
