## Python for NLP: Vocabulary and Phrase Matching with SpaCy
https://stackabuse.com/python-for-nlp-vocabulary-and-phrase-matching-with-spacy/

### Creating Matcher Object

In [7]:
import spacy
from spacy.matcher import Matcher

# Load the English model; the matcher below is built on its vocabulary.
nlp = spacy.load('en_core_web_sm')

# Token-pattern matcher bound to the pipeline's vocab.
m_tool = Matcher(nlp.vocab)

### Defining Patterns

In [8]:
# Every pattern is a list with one attribute dict per token to match.
# 'LOWER' compares against the lower-cased token text; 'IS_PUNCT' accepts a
# punctuation token.
p1 = [
    {'LOWER': 'quickbrownfox'},
]
p2 = [
    {'LOWER': 'quick'},
    {'IS_PUNCT': True},
    {'LOWER': 'brown'},
    {'IS_PUNCT': True},
    {'LOWER': 'fox'},
]
p3 = [
    {'LOWER': 'quick'},
    {'LOWER': 'brown'},
    {'LOWER': 'fox'},
]
p4 = [
    {'LOWER': 'quick'},
    {'LOWER': 'brownfox'},
]

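* p1 looks for the single token "quickbrownfox"
* p2 looks for "quick", "brown" and "fox" with one punctuation token between each pair, e.g. "quick-brown-fox"
* p3 looks for the phrase "quick brown fox"
* p4 looks for the phrase "quick brownfox"

All four patterns compare lower-cased token text, so matching is case-insensitive.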

In [12]:
# Once the patterns are defined, register them with the Matcher created earlier.
# The patterns are passed as ONE list: that is the only signature spaCy v3
# accepts, and it is also supported from spaCy 2.2.2 on. The older
# `add(key, None, *patterns)` form raises an error on v3.
m_tool.add('QBF', [p1, p2, p3, p4])
# "QBF" is the ID of this rule (not of the matcher); any string can be used.

### Applying Matcher to the Document

In [14]:
# NOTE: the trailing backslash continues the string literal itself, so the
# indentation at the start of the next line is part of the text passed to nlp.
# The token offsets printed in the cells below were produced with this exact
# text, so the spacing should not be "tidied" without re-running them.
sentence = nlp(u'The quick-brown-fox jumps over the lazy dog. The quick brown fox eats well. \
               the quickbrownfox is dead. the dog misses the quick brownfox')


In [15]:
# Running the matcher over the Doc yields (match_id, start, end) tuples.
phrase_matches = m_tool(sentence)
print(phrase_matches)

[(12825528024649263697, 1, 6), (12825528024649263697, 13, 16), (12825528024649263697, 21, 22), (12825528024649263697, 29, 31)]


In [16]:
# For every hit, look up the rule name behind the hash and slice out the
# matched tokens so the text can be shown next to the offsets.
for match_id, start, end in phrase_matches:
    string_id, span = nlp.vocab.strings[match_id], sentence[start:end]
    print(match_id, string_id, start, end, span.text)

12825528024649263697 QBF 1 6 quick-brown-fox
12825528024649263697 QBF 13 16 quick brown fox
12825528024649263697 QBF 21 22 quickbrownfox
12825528024649263697 QBF 29 31 quick brownfox


In [17]:
# Drop the 'QBF' rule so the same ID can be registered again with a new pattern.
m_tool.remove('QBF')

In [18]:
# Same word sequence as before, but each punctuation slot carries 'OP': '*',
# i.e. it may match zero or more punctuation tokens between the words.
p1 = [
    {'LOWER': 'quick'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'brown'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'fox'},
]
# Patterns are passed as a list: required by spaCy v3 and supported from
# spaCy 2.2.2 on. The older `add(key, None, *patterns)` form errors on v3.
m_tool.add('QBF', [p1])

In [19]:
# Test text with differing runs of hyphens between the words. The double space
# before the second "quick" is part of the input; the offsets printed below
# were produced with this exact text.
sentence = nlp(u'The quick--brown--fox jumps over the  quick-brown---fox')


In [20]:
# Re-run the matcher with the wildcard rule on the new sentence.
phrase_matches = m_tool(sentence)

# Report the rule name and the matched text for each hit.
for match_id, start, end in phrase_matches:
    string_id, span = nlp.vocab.strings[match_id], sentence[start:end]
    print(match_id, string_id, start, end, span.text)

12825528024649263697 QBF 1 6 quick--brown--fox
12825528024649263697 QBF 10 15 quick-brown---fox


## Phrase-Based Matching


In [22]:
import bs4 as bs  
import urllib.request  
import re  
import nltk

# Download the raw HTML. The context manager closes the HTTP response once the
# body has been read (or if the read fails) instead of leaving it open.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence') as scrapped_data:
    article = scrapped_data.read()

parsed_article = bs.BeautifulSoup(article, 'lxml')

# Only the text inside <p> elements is kept.
paragraphs = parsed_article.find_all('p')

# Concatenate the paragraph texts in one pass rather than growing a string
# inside a loop.
article_text = "".join(p.text for p in paragraphs)

# Normalise the text: lower-case it, replace every non-letter with a space,
# then collapse runs of whitespace into a single space.
processed_article = article_text.lower()
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article)
processed_article = re.sub(r'\s+', ' ', processed_article)

### Create Phrase Matcher Object

In [24]:
import spacy
from spacy.matcher import PhraseMatcher

# Load the model again; this rebinds `nlp`.
nlp = spacy.load('en_core_web_sm')

# Phrase matcher built on the pipeline's vocab.
phrase_matcher = PhraseMatcher(nlp.vocab)

### Create Phrase List

In [26]:
# Terms to look for in the article text.
phrases = ['machine learning', 'robots', 'intelligent agents']

# Run each phrase through nlp, in order, to get one pattern object per phrase.
patterns = list(map(nlp, phrases))

In [27]:
# Register every phrase pattern under the rule ID 'AI'. The patterns are passed
# as a list: required by spaCy v3 and supported from spaCy 2.2.2 on. The older
# `add(key, None, *patterns)` form errors on v3.
phrase_matcher.add('AI', patterns)

### Applying Matcher to the Document

In [30]:
# Process the cleaned article text.
sentence = nlp(processed_article)

# Find every occurrence of the registered phrases; the bare name on the last
# line displays the result as the cell output.
matched_phrases = phrase_matcher(sentence)
matched_phrases

[(5530044837203964789, 35, 37),
 (5530044837203964789, 350, 352),
 (5530044837203964789, 641, 642),
 (5530044837203964789, 1233, 1235),
 (5530044837203964789, 1548, 1550),
 (5530044837203964789, 3094, 3096),
 (5530044837203964789, 3253, 3255),
 (5530044837203964789, 3793, 3794),
 (5530044837203964789, 5251, 5252),
 (5530044837203964789, 5328, 5329),
 (5530044837203964789, 6816, 6818),
 (5530044837203964789, 6828, 6830),
 (5530044837203964789, 7550, 7552),
 (5530044837203964789, 7689, 7691),
 (5530044837203964789, 8056, 8058),
 (5530044837203964789, 9531, 9532),
 (5530044837203964789, 9596, 9597),
 (5530044837203964789, 9851, 9853),
 (5530044837203964789, 10203, 10205),
 (5530044837203964789, 11231, 11232),
 (5530044837203964789, 11697, 11698),
 (5530044837203964789, 12711, 12712),
 (5530044837203964789, 12822, 12823),
 (5530044837203964789, 12943, 12944),
 (5530044837203964789, 12987, 12988)]

In [31]:
# For every hit, look up the rule name behind the hash and show the matched
# text alongside its token offsets.
for match_id, start, end in matched_phrases:
    string_id, span = nlp.vocab.strings[match_id], sentence[start:end]
    print(match_id, string_id, start, end, span.text)

5530044837203964789 AI 35 37 intelligent agents
5530044837203964789 AI 350 352 machine learning
5530044837203964789 AI 641 642 robots
5530044837203964789 AI 1233 1235 machine learning
5530044837203964789 AI 1548 1550 intelligent agents
5530044837203964789 AI 3094 3096 intelligent agents
5530044837203964789 AI 3253 3255 machine learning
5530044837203964789 AI 3793 3794 robots
5530044837203964789 AI 5251 5252 robots
5530044837203964789 AI 5328 5329 robots
5530044837203964789 AI 6816 6818 machine learning
5530044837203964789 AI 6828 6830 machine learning
5530044837203964789 AI 7550 7552 machine learning
5530044837203964789 AI 7689 7691 machine learning
5530044837203964789 AI 8056 8058 machine learning
5530044837203964789 AI 9531 9532 robots
5530044837203964789 AI 9596 9597 robots
5530044837203964789 AI 9851 9853 machine learning
5530044837203964789 AI 10203 10205 machine learning
5530044837203964789 AI 11231 11232 robots
5530044837203964789 AI 11697 11698 robots
5530044837203964789 AI 127

## Stop Words

In [33]:
import spacy
# Loads the same model name used earlier in the notebook, this time bound to `sp`.
sp = spacy.load('en_core_web_sm')
# The output is printed with set braces, so the order of the words is not
# meaningful.
print(sp.Defaults.stop_words)

{'somehow', 'though', 'that', 'above', 'below', 'name', 'against', 'everywhere', 'few', 'part', 'someone', 'down', 'has', 'have', 'why', 'themselves', 'give', 'due', 'us', 'is', 'others', 'show', 'while', 'from', 'we', 'thereby', 'twelve', 'another', 'or', '’d', 'amongst', 'either', 'own', 'a', 'sometimes', 'whereby', 'many', "'m", 'became', 'besides', 'most', 'therein', 'seemed', 'ours', 'some', 'made', 'every', 'whoever', 'back', 'becomes', 'forty', 'he', 'once', 'no', 'take', 'across', 'therefore', 'which', 'latter', 'where', "'ve", 'whom', 'about', 'hereafter', 'except', 'last', 'and', 'through', 'onto', 'she', 'somewhere', 'whereas', 'not', '’ll', 'whole', 'me', 'next', 'whither', 'am', 'anywhere', 'during', 'front', 'my', 'mostly', 'i', 'four', 'mine', 'seeming', 'never', 'seems', 'thru', 'whence', 'with', 'everyone', 'although', 'eight', 'nor', 'her', 'these', 'throughout', 'what', 'amount', 'per', 'who', 'such', 'go', 'can', 'move', 'being', 'else', 'get', 'other', 'eleven', 'w

In [34]:
# Read the stop-word flag on the vocabulary entry for 'wonder'.
sp.vocab['wonder'].is_stop

False

In [35]:
# Add 'wonder' to the default stop-word collection.
sp.Defaults.stop_words.add('wonder')

In [36]:
# Explicitly set the stop-word flag on the vocabulary entry for 'wonder'.
sp.vocab['wonder'].is_stop = True

In [37]:
# Read the flag again after the update above.
sp.vocab['wonder'].is_stop

True