In [95]:
# One-time environment setup, left commented out so that re-running the
# notebook does not reinstall anything. Uncomment to install spaCy, the
# lookups data package, and the small English model that is loaded below.
#!pip install -U spacy
#!pip install -U spacy-lookups-data
#!python -m spacy download en_core_web_sm

In [96]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [97]:
# Load the small English pipeline; its vocab is what the Matcher is built on below.
nlp = spacy.load('en_core_web_sm')

In [98]:
# Run the pipeline over a sample string; the bare `doc` displays its text.
doc = nlp("Hello World!")
doc

Hello World!

In [99]:
# Iterating over the doc yields its tokens in order (punctuation is a
# separate token); print one per line.
for token in doc:
    print(token)

Hello
World
!


In [100]:
# Token pattern for the Matcher, one dict per token:
#   1. an optional "hello" (compared against the lowercased token text),
#   2. an optional punctuation token,
#   3. a required "world".
# Both leading tokens carry "OP": "?", so shorter sub-matches that skip
# them are reported as well.
pattern = [
    {"LOWER": "hello", "OP": "?"},
    {"IS_PUNCT": True, "OP": "?"},
    {"LOWER": "world"},
]

In [101]:
# Build a matcher on the pipeline's vocab and register the pattern under the
# key 'HelloWorld'; add() receives a list of patterns, hence [pattern].
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', [pattern])

In [102]:
# Apply the matcher to the doc; each result is unpacked below as
# (match_id, start, end).
matches = matcher(doc)

In [103]:
# Report every match: the integer match_id is turned back into its string key
# via nlp.vocab.strings, and start/end slice the doc into the matched span.
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(match_id, string_id, start, end, span.text)

15578876784678163569 HelloWorld 0 2 Hello World
15578876784678163569 HelloWorld 1 2 World


In [104]:
# Same greeting with a comma, to exercise the optional punctuation token
# in the pattern. Note this rebinds `doc`.
doc = nlp('Hello, World!')

In [105]:
# Re-run the matcher on the new doc and display the raw
# (match_id, start, end) tuples.
matches = matcher(doc)
matches

[(15578876784678163569, 0, 3),
 (15578876784678163569, 1, 3),
 (15578876784678163569, 2, 3)]

In [106]:
# List the tokens of the current doc, one per line, so the start/end
# indices reported by the matcher can be read against them.
for token in doc:
    print(token)

Hello
,
World
!


In [107]:
# Same reporting loop as for the first doc: resolve the match key through
# nlp.vocab.strings and print the matched span's text with its token offsets.
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(match_id, string_id, start, end, span.text)

15578876784678163569 HelloWorld 0 3 Hello, World
15578876784678163569 HelloWorld 1 3 , World
15578876784678163569 HelloWorld 2 3 World


- Interactive rule-based Matcher demo: https://demos.explosion.ai/matcher
- displaCy visualizer demo for "hello world!": https://demos.explosion.ai/displacy?text=hello%20world!&model=en_core_web_sm&cpu=0&cph=0

In [108]:
# Sample sentence containing a 4-digit and a 10-digit number, used as the
# input for the regex examples that follow.
text = "my phone number is 1234. Ohh its wrong! Correct one is 1234567890. Call me!"

#### Regular Expression

In [109]:
import re

In [110]:
# \d{4,10} = a run of 4 to 10 digits. search() returns only the first match.
re.search(r'\d{4,10}', text)

<re.Match object; span=(19, 23), match='1234'>

In [111]:
# Same pattern with findall(): every non-overlapping match is returned as a list.
re.findall(r'\d{4,10}', text)

['1234', '1234567890']

In [112]:
# \w{4} = exactly four word characters. Longer words are consumed in
# 4-character chunks and any leftover shorter than four is dropped
# (e.g. 'phone' -> 'phon', '1234567890' -> '1234', '5678').
re.findall(r'\w{4}', text)

['phon', 'numb', '1234', 'wron', 'Corr', '1234', '5678', 'Call']

In [113]:
# \w{4,} = four or more word characters, so whole words of length >= 4 are kept intact.
re.findall(r'\w{4,}', text)

['phone', 'number', '1234', 'wrong', 'Correct', '1234567890', 'Call']

#### Wildcard Text

In [114]:
# '.' matches any single character (except a newline): 'p' plus the next three characters.
re.findall(r'p...', text)

['phon']

In [115]:
# Matching is case-sensitive, so the capital 'C' in "Correct" and "Call" is
# skipped; the only hit starts at the lowercase 'c' inside "Correct one".
re.findall(r'c...', text)

['ct o']

In [116]:
# New sample text. '.a.' = any character, a literal 'a', any character;
# spaces count as characters too, which is why ' an' appears in the result.
text = "this is cat but not that. i want hat and cat both"
re.findall(r'.a.', text)

['cat', 'hat', 'wan', 'hat', ' an', 'cat']

In [117]:
# '$' anchors the match to the end of the string: a digit in last position.
text = "hi thanks for reading <3"
re.findall(r'\d$', text)

['3']

In [118]:
# '^' anchors the match to the start of the string: only the leading digit
# is returned, not the one at the end.
text = "3 hi thanks for reading <3"
re.findall(r'^\d', text)

['3']

#### Exclusion

In [119]:
# Display the current sample string used by the exclusion examples below.
text

'3 hi thanks for reading <3'

In [120]:
# [^...] negates a character class: runs of characters that are NOT digits.
re.findall(r'[^\d]+', text)

[' hi thanks for reading <']

In [121]:
# Negating \D (non-digit) leaves only digits, so this behaves like \d+.
re.findall(r'[^\D]+', text)

['3', '3']

In [122]:
# Runs of non-word characters: here the spaces and the '<'.
re.findall(r'[^\w]+', text)

[' ', ' ', ' ', ' ', ' <']

In [123]:
# Negating \W (non-word) leaves word characters, so this behaves like \w+.
re.findall(r'[^\W]+', text)

['3', 'hi', 'thanks', 'for', 'reading', '3']

In [124]:
# Find hyphenated words: one or more word characters, a literal '-', then
# one or more word characters.
text = "you can get free-classes on great account of kgp-talkie. Thank you Laxmi"
re.findall(r'[\w]+-[\w]+', text)

['free-classes', 'kgp-talkie']