In [77]:
#!pip install -U spacy
#!pip install -U spacy-lookups-data
#!python -m spacy download en_core_web_sm

In [78]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [79]:
# Load the small English pipeline; the en_core_web_sm model must already be
# installed (see the commented-out install commands in the first cell).
nlp = spacy.load('en_core_web_sm')

In [80]:
# Run the pipeline on a short sentence; the bare `doc` on the last line
# displays it as the cell output.
doc = nlp("Hello World!")
doc

Hello World!

In [81]:
# Walk the Doc one token at a time to show how the sentence was tokenized
# (the punctuation mark comes out as its own token).
for tok in doc:
    print(tok)

Hello
World
!


In [97]:
# One dict per token to match, in order:
#   1. an optional token whose lowercase form is "hello"  ("OP": "?" = zero or one)
#   2. an optional punctuation token
#   3. a required token whose lowercase form is "world"
pattern = [
    {"LOWER": "hello", "OP": "?"},
    {"IS_PUNCT": True, "OP": "?"},
    {"LOWER": "world"},
]

In [98]:
# Build a rule-based matcher on the pipeline's vocabulary and register the
# token pattern under the key 'HelloWorld'.
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', [pattern])

In [99]:
# Run the matcher over the doc; each result is unpacked below as a
# (match_id, start, end) tuple.
matches = matcher(doc)

In [100]:
# For every match, show the hash id, its string name (looked up in the
# vocab's string store), the token offsets, and the matched text.
# NOTE(review): the recorded output below contains a comma although the doc
# built earlier is "Hello World!" — it looks like this cell last ran after
# `doc` had been reassigned to 'Hello, World!'. Re-run top to bottom to refresh.
for match_id, start, end in matches:
    span = doc[start:end]
    string_id = nlp.vocab.strings[match_id]
    print(f"{match_id} {string_id} {start} {end} {span.text}")

15578876784678163569 HelloWorld 0 3 Hello, World
15578876784678163569 HelloWorld 1 3 , World
15578876784678163569 HelloWorld 2 3 World


In [101]:
# Re-parse with a comma so the optional IS_PUNCT token in the pattern has
# something to match between "Hello" and "World".
doc = nlp('Hello, World!')

In [102]:
# Re-run the same matcher on the new doc; the bare `matches` displays the raw
# (match_id, start, end) tuples.
matches = matcher(doc)
matches

[(15578876784678163569, 0, 3),
 (15578876784678163569, 1, 3),
 (15578876784678163569, 2, 3)]

In [103]:
# List the tokens of the re-parsed doc; the comma is a separate token, which
# is why the match offsets above run from 0 to 3.
for tok in doc:
    print(tok)

Hello
,
World
!


In [104]:
# Resolve each match into readable form: hash id, rule name from the string
# store, token offsets, and the text of the matched span.
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(" ".join(str(field) for field in (match_id, string_id, start, end, span.text)))

15578876784678163569 HelloWorld 0 3 Hello, World
15578876784678163569 HelloWorld 1 3 , World
15578876784678163569 HelloWorld 2 3 World


* https://demos.explosion.ai/matcher
* https://demos.explosion.ai/displacy?text=hello%20world!&model=en_core_web_sm&cpu=0&cph=0


In [105]:
text = "my phone number is 1234. Ohh its wrong! Correct one is 1234567890. Call me!"

### Regular Expression

In [106]:
import re

In [107]:
# \d{4,10}: a run of 4 to 10 consecutive digits. re.search stops at the first
# hit and returns a Match object (or None when nothing matches).
re.search(r'\d{4,10}', text)

<re.Match object; span=(19, 23), match='1234'>

In [108]:
# Same pattern with findall: every non-overlapping run of 4 to 10 digits,
# returned as a list of strings.
re.findall(r'\d{4,10}', text)

['1234', '1234567890']

In [109]:
# \w{4}: exactly four word characters per hit. Matches do not overlap, so
# longer words are cut into 4-character chunks and leftovers shorter than
# four characters are dropped.
re.findall(r'\w{4}', text)

['phon', 'numb', '1234', 'wron', 'Corr', '1234', '5678', 'Call']

In [110]:
# \w{4,}: four or more word characters. The open upper bound keeps whole
# words intact; words shorter than four characters are skipped.
re.findall(r'\w{4,}', text)

['phone', 'number', '1234', 'wrong', 'Correct', '1234567890', 'Call']

### Wildcard Text

In [112]:
# Each '.' matches any single character (except a newline): a 'p' followed by
# any three characters.
re.findall(r'p...', text)

['phon']

In [113]:
# Matching is case-sensitive and '.' also matches spaces: "Correct" and
# "Call" start with an uppercase 'C', so the only hit is 'ct o', taken from
# inside "Correct one".
re.findall(r'c...', text)

['ct o']

In [114]:
text = "this is cat but not that. i want hat and cat both"

In [115]:
# Any character, then 'a', then any character. Spaces count as characters,
# which is why ' an' appears; 'hat' is also found inside "that".
re.findall(r'.a.', text)

['cat', 'hat', 'wan', 'hat', ' an', 'cat']

In [116]:
text = "hi thanks for reading <3"

In [117]:
# '$' anchors the match at the end of the string: a digit that is the last
# character.
re.findall(r'\d$', text)

['3']

In [119]:
text = "3 hi thanks for reading <3"

In [120]:
# '^' anchors the match at the start of the string: a digit that is the first
# character.
re.findall(r'^\d', text)

['3']

### Exclusion

In [122]:
text

'3 hi thanks for reading <3'

In [124]:
# Inside [...], a leading '^' negates the class: runs of characters that are
# NOT digits.
re.findall(r'[^\d]+', text)

[' hi thanks for reading <']

In [129]:
# Double negative: "not a non-digit" is a digit, so this behaves like \d+.
re.findall(r'[^\D]+', text)

['3', '3']

In [128]:
# Runs of characters that are not word characters (here: spaces and '<').
re.findall(r'[^\w]+', text)

[' ', ' ', ' ', ' ', ' <']

In [127]:
# Double negative again: "not a non-word character", so this behaves like \w+.
re.findall(r'[^\W]+', text)

['3', 'hi', 'thanks', 'for', 'reading', '3']

In [132]:
text = "you can get free-classes on great account of kgp-talkie. Thank you Laxmi"

In [133]:
# Two runs of word characters joined by a literal hyphen, i.e. hyphenated
# words. ([\w] is the same as a bare \w.)
re.findall(r'[\w]+-[\w]+', text)

['free-classes', 'kgp-talkie']

## Regular Expression in SpaCy
* Match different spellings of token texts: `pattern = [{"TEXT": {"REGEX": "deff?in[ia]tely"}}]`
* Match tokens with fine-grained POS tags starting with 'V': `pattern = [{"TAG": {"REGEX": "^V"}}]`


In [144]:
text = "Google announced a new Pixel at Google I/O . Google I/O is a great place to get all updates from Google"

In [145]:
# Exact-text token pattern for "Google I/O". "I/O" is matched as three separate
# tokens ("I", "/", "O"), so each piece gets its own token spec.
pattern = [{"TEXT": piece} for piece in ("Google", "I", "/", "O")]

In [146]:
def callback_method(matcher, doc, i, matches):
    """Print the text of the match that triggered this callback.

    Has the four-argument signature of a callback passed to ``Matcher.add``
    via ``on_match``.

    Args:
        matcher: The matcher that produced the match (unused here).
        doc: The document being matched; sliced by token offsets.
        i: Index of the current match within ``matches``.
        matches: Sequence of ``(match_id, start, end)`` tuples.
    """
    # Only the span boundaries are needed; the match id is ignored.
    _, start, end = matches[i]
    print(doc[start:end].text)

In [147]:
# Fresh matcher for the Google I/O rule; on_match registers callback_method
# to be called for each match the matcher finds.
matcher = Matcher(nlp.vocab)
matcher.add('GoogleIO', [pattern], on_match=callback_method)

In [148]:
# Parse the Google sentence so the matcher has a Doc to run over.
doc = nlp(text)

In [149]:
# Running the matcher fires the on_match callback, which prints each matched
# span; the raw match tuples are kept in `matches`.
matches = matcher(doc)

Google I/O
Google I/O


### Find Word Google

In [150]:
# "Google" is required; the "I", "/" and "O" tokens are each optional
# ("OP": "?"), so a bare "Google" and every partial prefix of "Google I/O"
# match as well.
pattern = [{"TEXT": "Google"}] + [
    {"TEXT": piece, "OP": "?"} for piece in ("I", "/", "O")
]

In [151]:
def callback_method(matcher, doc, i, matches):
    """on_match hook: echo the matched span's text each time a pattern fires.

    Behaves the same as the callback defined earlier in this notebook for the
    exact "Google I/O" pattern; it is re-declared here for this section.

    Args:
        matcher: The matcher invoking the hook (not used).
        doc: The document currently being matched.
        i: Position of the triggering match inside ``matches``.
        matches: Sequence of ``(match_id, start, end)`` tuples.
    """
    match_id, begin, stop = matches[i]
    matched_span = doc[begin:stop]
    print(matched_span.text)

In [152]:
# New Matcher instance holding only the optional-token pattern; the
# 'GoogleIO' key is reused, but the matcher itself is rebuilt from scratch.
matcher = Matcher(nlp.vocab)
matcher.add('GoogleIO', [pattern], on_match=callback_method)

In [153]:
# NOTE(review): `text` does not appear to have changed since the previous
# parse, so this re-parse looks redundant — confirm before removing.
doc = nlp(text)

In [154]:
# With the optional tokens, each "Google I/O" occurrence is reported as
# several matches ("Google", "Google I", "Google I/", "Google I/O"), and the
# callback prints one line per match.
matches = matcher(doc)

Google
Google
Google I
Google I/
Google I/O
Google
Google I
Google I/
Google I/O
Google
