In [1]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [2]:
# Load spaCy's small English pipeline; its vocab is what the Matcher objects below are built on.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Token pattern: "hello", an optional punctuation token, then "world".
# LOWER compares against the lowercased token text, so "Hello" matches too.
pattern = [
    {"LOWER": "hello"},
    {"IS_PUNCT": True, "OP": "?"},  # "?" makes this token optional
    {"LOWER": "world"},
]

In [4]:
# Build a token matcher on the pipeline's vocabulary and register the
# pattern under the key "HelloWorld" (add() takes a list of patterns).
matcher = Matcher(nlp.vocab)
matcher.add("HelloWorld", [pattern])

In [5]:
# Run the pipeline on a sample holding both variants ("Hello, world" and
# "Hello world"), then apply the matcher to the resulting Doc.
doc = nlp("Hello, world! Hello world!")
matches = matcher(doc)

In [6]:
# Each entry is a (match_id, start, end) tuple; start/end are used as token slice bounds in the next cell.
matches

[(15578876784678163569, 0, 3), (15578876784678163569, 4, 6)]

In [7]:
# Resolve each match's hash back to its rule name and show the matched text.
for match_id, start, end in matches:
    span = doc[start:end]
    string_id = nlp.vocab.strings[match_id]  # hash -> registered key
    print(f"{match_id} {string_id} {start} {end} {span.text}")

15578876784678163569 HelloWorld 0 3 Hello, world
15578876784678163569 HelloWorld 4 6 Hello world


## Regular expressions using `re`

In [8]:
# Sample sentence with a 3-digit and a 10-digit number for the regex examples.
text = "My phone no is 789, ho sorry, it is 1234567890, please call me."

In [9]:
# NOTE(review): consider moving this import to the first cell with the others.
import re

# \d{10}: ten consecutive digits; search() returns the first match only.
re.search(r'\d{10}', text)

<re.Match object; span=(36, 46), match='1234567890'>

In [10]:
# \d{3}: the first run of three digits wins, here the standalone "789".
re.search(r'\d{3}', text)

<re.Match object; span=(15, 18), match='789'>

In [11]:
# \d{3,10}: greedy runs of 3 to 10 digits; findall() returns every non-overlapping match.
re.findall(r'\d{3,10}', text)

['789', '1234567890']

In [12]:
# \w{4,}: runs of four or more word characters; \w includes digits, so the phone number is returned too.
re.findall(r'\w{4,}', text)

['phone', 'sorry', '1234567890', 'please', 'call']

### Wildcard text

In [13]:
# "." matches any single character (except a newline), so this is "p" plus the next four characters.
re.findall(r'p....', text)

['phone', 'pleas']

In [14]:
# Six-character windows whose fourth character is "a".
re.findall(r'...a..', text)

['please']

In [15]:
# [^\d]+: a negated character class, i.e. runs of characters that are not digits.
re.findall(r'[^\d]+', text)

['My phone no is ', ', ho sorry, it is ', ', please call me.']

In [16]:
# [^\D]+: negating "non-digit" gives digits again, so this behaves like \d+.
re.findall(r'[^\D]+', text)

['789', '1234567890']

In [17]:
# Second sample, containing a hyphenated compound.
text1 = "Hi, this is praveen chakravarthy works in Artificial-Intelligence"

In [18]:
# Word characters on both sides of a literal hyphen, i.e. hyphenated words.
re.findall(r'[\w]+-[\w]+', text1)

['Artificial-Intelligence']

## Regular expression using spaCy

In [19]:
# NOTE: this rebinds `text`, replacing the phone-number sample used by the regex cells above.
text = "Google announced a new pixel platform at Google I/O. Google I/O is a great platform to get all updates from Google."

In [21]:
# One exact-text (case-sensitive) token spec per expected token of "Google I/O".
# NOTE(review): the recorded run printed a single match although the text
# mentions "Google I/O" twice; presumably the occurrence followed by "." is
# tokenized differently. Inspect [t.text for t in doc] to confirm.
pattern = [{"TEXT": token_text} for token_text in ("Google", "I", "/", "O")]

In [28]:
def callback_method(matcher, doc, matches):
    """Print the text of every matched span, one per line.

    Args:
        matcher: Unused; accepted so existing three-argument calls keep working.
        doc: Object that is sliced as ``doc[start:end]``; each slice must
            expose a ``.text`` attribute.
        matches: Iterable of ``(match_id, start, end)`` triples.

    Returns:
        None. The span texts are written to stdout.

    Note:
        spaCy's ``Matcher.add(..., on_match=...)`` hook is invoked with four
        arguments ``(matcher, doc, i, matches)``, so this function is meant to
        be called directly rather than registered as ``on_match``.
    """
    # The match id is not needed for printing, so it is not resolved through
    # a string store and the function does not rely on a global pipeline.
    for _match_id, start, end in matches:
        print(doc[start:end].text)

In [25]:
# Rebind `matcher` to a fresh Matcher that holds only the "Google" pattern.
matcher = Matcher(nlp.vocab)
matcher.add("Google",[pattern])

In [26]:
# Parse the Google sample and collect the (match_id, start, end) tuples for it.
doc = nlp(text)
matches = matcher(doc)

In [30]:
# Called directly with the precomputed matches; it prints each matched span's text.
callback_method(matcher, doc, matches)

Google I/O
