## Rule-Based Matching

In [1]:
# Load the small English spaCy model; `nlp` is the pipeline used by the rule-based cells below.
import spacy
nlp = spacy.load('en_core_web_sm')

In [4]:
# Import the rule-based Matcher class from spaCy.
from spacy.matcher import Matcher
# Create the matcher object, passing it nlp.vocab.
# Vocab: a storage class for the vocabulary and other data shared across a language in spaCy.
matcher = Matcher(nlp.vocab)
# `matcher` is now paired with the current vocab object;
# named match rules can be added to and removed from it as needed.

In [6]:
# A pattern (rule) is a list of dictionaries, one dictionary per token.
#
# "Hello World" can appear in two forms:
#   1. Hello World
#   2. Hello-World

# Pattern 1: two adjacent tokens whose lowercase text is 'hello' and 'world'.
# Because the comparison is made on the lowercase form, it also matches
# 'HELLO WORLD', 'Hello world', 'HEllo WOrld', etc.
pattern_1 = [
    {'LOWER': 'hello'},
    {'LOWER': 'world'},
]

# Pattern 2: 'hello' and 'world' separated by a single punctuation token
# of any kind, e.g. 'hello-world'.
pattern_2 = [
    {'LOWER': 'hello'},
    {'IS_PUNCT': True},
    {'LOWER': 'world'},
]

# 'LOWER' and 'IS_PUNCT' are token attributes; their names must be spelled
# exactly like this. Many more attributes are available.

In [44]:
# Register the patterns with the Matcher object.
#
# A match rule consists of:
#   1. an ID key
#   2. a list of one or more patterns
#   3. an optional on_match callback (keyword argument; omitted here, so no callback)
#
# The patterns are passed as one list: this call form is accepted from
# spaCy v2.2.2 onwards and is the only form spaCy v3 accepts. The older
# add(key, None, *patterns) form raises an error on v3.
matcher.add('Hello World', [pattern_1, pattern_2])


In [45]:
# Build the sample text and run it through the pipeline to get a Doc.
s1 = (
    " 'Hello World' are the firsttwo printed word for most of the programmers , "
    "printing 'Hello-World' is most common for"
)
doc = nlp(s1)

In [46]:
# Run the matcher over the document.
find_matches = matcher(doc)
print(find_matches)
# The result is a list of (match_id, start, end) tuples:
# match_id is the integer ID of the rule; doc[start:end] is the matched span.

[(8585552006568828647, 2, 4), (1734270822155717711, 2, 4), (8585552006568828647, 18, 21), (1734270822155717711, 18, 21)]


In [47]:
# Walk over every match and print its ID, rule name, token offsets and text.
for match_id, start, end in find_matches:
  rule_name = nlp.vocab.strings[match_id]  # string form of the integer match ID
  matched_text = doc[start:end].text       # text of the matched token span
  print(match_id, rule_name, start, end, matched_text)


8585552006568828647 Hello World 2 4 Hello World
1734270822155717711 Welcome 2 4 Hello World
8585552006568828647 Hello World 18 21 Hello-World
1734270822155717711 Welcome 18 21 Hello-World


## Phrase-Based Matching 

In [12]:
# Import spaCy and load the small pre-trained spaCy model as a second pipeline object, `nlp_1`.
import spacy
nlp_1 = spacy.load('en_core_web_sm')

In [13]:
# Import the PhraseMatcher class.
from spacy.matcher import PhraseMatcher
# Build the matcher on nlp_1's vocab: nlp_1 is the pipeline that creates the
# phrase patterns and the document searched below, and the rule name added to
# the matcher is stored in the StringStore of the vocab given here. Building
# it on another pipeline's vocab makes nlp_1.vocab.strings[match_id] raise
# KeyError when the match ID is resolved.
phraseMatcher = PhraseMatcher(nlp_1.vocab)

In [16]:
# Terminology to search for, one phrase per entry.
phrase_list = [
    "Barack Obama",
    "Angela Merkel",
    "Washington D.C.",
]

In [24]:
# Turn every phrase into a Doc by running it through the nlp_1 pipeline.
phrase_patterns = list(map(nlp_1, phrase_list))


In [25]:
# Check the type of the first phrase pattern; it should be a spaCy Doc.
type(phrase_patterns[0])

spacy.tokens.doc.Doc

In [27]:
# Register all phrase Docs under a single rule name.
# The patterns are passed as one list: this call form is accepted from
# spaCy v2.2.2 onwards and is the only form spaCy v3 accepts. The older
# add(key, None, *docs) form raises an error on v3.
phraseMatcher.add("TerminologyList", phrase_patterns)


In [30]:
# Text to search for the phrases, processed with the nlp_1 pipeline.
doc_3 = nlp_1(
    "German chancellor Angela Merkel and US President Barack Obama"
    " converse in Oval office inside the white house"
    " in Washington D.C."
)

In [31]:
# Run the phrase matcher over doc_3 and print the result.
# NOTE: this rebinds find_matches, which earlier held the rule-based matches.
find_matches = phraseMatcher(doc_3)
print(find_matches)

[(3766102292120407359, 2, 4), (3766102292120407359, 7, 9), (3766102292120407359, 18, 20)]


In [35]:
# Print every phrase match (left disabled: running it raised KeyError).
# NOTE(review): a match_id resolves only through the StringStore of the vocab
# the PhraseMatcher was constructed with. The lookup below uses nlp_1.vocab,
# so the matcher has to be built on nlp_1.vocab for it to work - confirm
# before re-enabling.
# for match_id,start,end in find_matches:
#   string_id = nlp_1.vocab.strings[match_id] #to get the string representation
#   span = doc_3[start:end]                   #to get the matched span
#   print(match_id , string_id , start , end , span.text)

KeyError: ignored