In [1]:
#!pip install -U spacy
#!pip install -U spacy-lookups-data
#!python -m spacy download en_core_web_sm

In [2]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [3]:
nlp = spacy.load('en_core_web_sm')

In [4]:
doc = nlp("Hello World!")
doc

Hello World!

In [5]:
for token in doc:
  print(token)

Hello
World
!


In [6]:
pattern = [{"LOWER":"hello", "OP":"?"},
           {"IS_PUNCT":True, "OP":"?"},
           {"LOWER":"world"}]

In [7]:
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', [pattern])

In [8]:
matches = matcher(doc)

In [9]:
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(match_id, string_id, start, end, span.text)

15578876784678163569 HelloWorld 0 2 Hello World
15578876784678163569 HelloWorld 1 2 World


In [10]:
doc = nlp('Hello, World!')

In [11]:
matches = matcher(doc)
matches

[(15578876784678163569, 0, 3),
 (15578876784678163569, 1, 3),
 (15578876784678163569, 2, 3)]

In [12]:
for token in doc:
  print(token)

Hello
,
World
!


In [13]:
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(match_id, string_id, start, end, span.text)

15578876784678163569 HelloWorld 0 3 Hello, World
15578876784678163569 HelloWorld 1 3 , World
15578876784678163569 HelloWorld 2 3 World


* https://demos.explosion.ai/matcher
* https://demos.explosion.ai/displacy?text=hello%20world!&model=en_core_web_sm&cpu=0&cph=0


In [14]:
text = "my phone number is 1234. Ohh its wrong! Correct one is 1234567890. Call me!"

### Regular Expression

In [15]:
import re

In [16]:
re.search(r'\d{4,10}', text)

<re.Match object; span=(19, 23), match='1234'>

In [17]:
re.findall(r'\d{4,10}', text)

['1234', '1234567890']

In [18]:
re.findall(r'\w{4}', text)

['phon', 'numb', '1234', 'wron', 'Corr', '1234', '5678', 'Call']

In [19]:
re.findall(r'\w{4,}', text)

['phone', 'number', '1234', 'wrong', 'Correct', '1234567890', 'Call']

### Wildcard Text

In [20]:
re.findall(r'p...', text)

['phon']

In [21]:
re.findall(r'c...', text)

['ct o']

In [22]:
text = "this is cat but not that. i want hat and cat both"

In [23]:
re.findall(r'.a.', text)

['cat', 'hat', 'wan', 'hat', ' an', 'cat']

In [24]:
text = "hi thanks for reading <3"

In [25]:
re.findall(r'\d$', text)

['3']

In [26]:
text = "3 hi thanks for reading <3"

In [27]:
re.findall(r'^\d', text)

['3']

### Exclusion

In [28]:
text

'3 hi thanks for reading <3'

In [29]:
re.findall(r'[^\d]+', text)

[' hi thanks for reading <']

In [30]:
re.findall(r'[^\D]+', text)

['3', '3']

In [31]:
re.findall(r'[^\w]+', text)

[' ', ' ', ' ', ' ', ' <']

In [32]:
re.findall(r'[^\W]+', text)

['3', 'hi', 'thanks', 'for', 'reading', '3']

In [33]:
text = "you can get free-classes on great account of kgp-talkie. Thank you Laxmi"

In [34]:
re.findall(r'[\w]+-[\w]+', text)

['free-classes', 'kgp-talkie']

## Regular Expression in SpaCy
* Match Different Spellings of Token Texts `pattern = [{"TEXT": {"REGEX": "deff?in[ia]tely"}}]`
* Match Tokens with Fine-Grained POS Tags starting with 'V' `pattern = [{"TAG": {"REGEX": "^V"}}]`


In [35]:
text = "Google announced a new Pixel at Google I/O . Google I/O is a great place to get all updates from Google"

In [36]:
pattern = [{"TEXT": "Google"},
           {"TEXT": "I"},
           {"TEXT": "/"},
           {"TEXT": "O"}]

In [37]:
def callback_method(matcher, doc, i, matches):
  """Print the text of the i-th match.

  Follows the ``on_match`` callback signature used with ``matcher.add``:
  ``matches`` is a list of ``(match_id, start, end)`` tuples and ``i`` is the
  index of the match being handled. ``matcher`` is accepted but not used.
  """
  _, first, last = matches[i]
  # Slice the doc by token offsets and print the resulting span's text.
  print(doc[first:last].text)

In [38]:
matcher = Matcher(nlp.vocab)
matcher.add('GoogleIO', [pattern], on_match=callback_method)

In [39]:
doc = nlp(text)

In [40]:
matches = matcher(doc)

Google I/O
Google I/O


### Find Word Google

In [41]:
pattern = [{"TEXT": "Google"},
           {"TEXT": "I", "OP": "?"},
           {"TEXT": "/", "OP": "?"},
           {"TEXT": "O", "OP": "?"}]

In [42]:
def callback_method(matcher, doc, i, matches):
  """Print the text of the i-th match.

  ``matches`` is a list of ``(match_id, start, end)`` tuples and ``i`` is the
  index of the match being handled. ``matcher`` is accepted but not used.

  NOTE(review): this redefinition has the same body as the ``callback_method``
  declared earlier in this notebook; the earlier definition could be reused.
  """
  _, first, last = matches[i]
  # Slice the doc by token offsets and print the resulting span's text.
  print(doc[first:last].text)

In [43]:
matcher = Matcher(nlp.vocab)
matcher.add('GoogleIO', [pattern], on_match=callback_method)

In [44]:
doc = nlp(text)

In [45]:
matches = matcher(doc)

Google
Google
Google I
Google I/
Google I/O
Google
Google I
Google I/
Google I/O
Google


# PART-2 : Phone Number, Email, Emoji Extraction

> `[{"LOWER":"facebook"},{"LEMMA":"be"},{"POS":"ADV","OP":"*"},{"POS":"ADJ"}]`
- This translates to a token whose lowercase form matches "facebook" (like Facebook, facebook or FACEBOOK), followed by a token with the lemma "be" (for example: is, was, or 's), followed by zero or more adverbs (the `"OP": "*"` operator), followed by an adjective.

In [46]:
matcher = Matcher(nlp.vocab)

In [47]:
matched_sents = []

In [48]:
pattern = [{"LOWER":"facebook"},{"LEMMA":"be"},{"POS":"ADV","OP":"*"},{"POS":"ADJ"}]

In [49]:
def callback_method_fb(matcher, doc, i, matches):
  """Record the sentence that contains the i-th match in ``matched_sents``.

  Appends a dict with the sentence text and a single "MATCH" entity whose
  ``start``/``end`` character offsets are relative to the start of that
  sentence. ``matched_sents`` is the module-level list defined in the
  notebook; ``matcher`` is accepted but not used.
  """
  _, tok_start, tok_end = matches[i]
  hit = doc[tok_start:tok_end]
  sentence = hit.sent
  # Span offsets are document-relative; shift them so they index into sentence.text.
  shift = sentence.start_char

  matched_sents.append({
      "text": sentence.text,
      "ents": [{
          "start": hit.start_char - shift,
          "end": hit.end_char - shift,
          "label": "MATCH",
      }],
  })

In [50]:
matcher.add("fb", [pattern], on_match=callback_method_fb)

In [51]:
doc = nlp("I'd say that Facebook is evil. - Facebook is pretty cool, right?")

In [52]:
matches = matcher(doc)
matches

[(8017838677478259815, 4, 7), (8017838677478259815, 9, 13)]

In [53]:
matched_sents

[{'text': "I'd say that Facebook is evil.",
  'ents': [{'start': 13, 'end': 29, 'label': 'MATCH'}]},
 {'text': '- Facebook is pretty cool, right?',
  'ents': [{'start': 2, 'end': 25, 'label': 'MATCH'}]}]

In [54]:
displacy.render(matched_sents, style="ent", manual=True)

## Phone Number Extraction
- Phone Numbers can have many different formats and matching them is often tricky. During `Tokenization`, spaCy will leave sequences of numbers intact and only split on whitespace and punctuation. This means that your match pattern will have to look out for number sequences of a certain length, surrounded by specific punctuation - depending on the national conventions.

- You want to match like this : `(123) 4567 8901 or (123) 4567-8901`
- [{"ORTH":"("}, {"SHAPE":"ddd"}, {"ORTH":")"}, {"SHAPE":"dddd"}, {"ORTH":"-", "OP":"?"}, {"SHAPE":"dddd"}]

In [55]:
pattern = [{"ORTH":"("}, {"SHAPE":"ddd"}, {"ORTH":")"}, {"SHAPE":"dddd"}, {"ORTH":"-", "OP":"?"}, {"SHAPE":"dddd"}]

In [56]:
matcher = Matcher(nlp.vocab)
matcher.add("PhoneNumber", [pattern])

In [57]:
doc = nlp("Call me at (123) 4560-7890")

In [58]:
print([t.text for t in doc])

['Call', 'me', 'at', '(', '123', ')', '4560', '-', '7890']


In [59]:
matches = matcher(doc)
matches

[(7978097794922043545, 3, 9)]

In [60]:
for match_id, start, end in matches:
  span = doc[start:end]
  print(span.text)

(123) 4560-7890


## E-Mail Address Matching

In [61]:
# Single-token pattern: the REGEX is applied to the token's TEXT, so an address
# is only found when the tokenizer keeps the whole address as one token.
# The "-" directly after the "0-9" range is read by Python's `re` as a literal
# hyphen (it does not start a new range); placing it last would make that explicit.
# NOTE(review): the expression has no ^/$ anchors — confirm whether a token that
# merely contains an address-like substring should also count as a match.
pattern = [{"TEXT": {"REGEX": "[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+"}}]

In [62]:
matcher = Matcher(nlp.vocab)
matcher.add("EMail", [pattern])

In [63]:
text = "Email me at email2me@colemerg.com and talk.me@rido.com"

In [64]:
doc = nlp(text)

In [65]:
matches = matcher(doc)
matches

[(10208870021556473826, 3, 4), (10208870021556473826, 5, 6)]

In [66]:
for match_id, start, end in matches:
  span = doc[start:end]
  print(span.text)

email2me@colemerg.com
talk.me@rido.com


## Hashtags and Emoji on Social Media
- Social media posts, especially tweets, can be difficult to work with. They're very short and often contain various emoji and hashtags. By only looking at the plain text, you'll lose a lot of valuable semantic information.
- Let's say you've extracted a large sample of social media posts on a specific topic, for example posts mentioning a brand name or product. As the first step of your data exploration, you want to filter out posts containing certain emoji and use them to assign a general sentiment score, based on whether the expressed emotion is positive or negative, e.g. 😊 or 😢. You also want to find, merge and label hashtags like #MondayMotivation, to be able to ignore or analyze them later.
- By default, spaCy's tokenizer will split emoji into separate tokens. This means that you can create a pattern for one or more emoji tokens. Valid hashtags usually consist of a #, plus a sequence of ASCII characters with no whitespace, making them easy to match as well.

In [67]:
pos_emoji = ["😊", "😃", "😄", "😁", "😆", "😅"] #positive_emoji
neg_emoji = ["😞", "😔", "😟", "😕", "🙁", "☹️"] #negative_emoji

In [68]:
print('positive emoji:',pos_emoji)
print('negative emoji:',neg_emoji)

positive emoji: ['😊', '😃', '😄', '😁', '😆', '😅']
negative emoji: ['😞', '😔', '😟', '😕', '🙁', '☹️']


In [69]:
# Build one pattern per emoji. Each inner list is a complete single-token
# pattern whose only token must have that emoji as its exact text (ORTH),
# so every pattern matches exactly one emoji token.
pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]
neg_patterns = [[{"ORTH": emoji}] for emoji in neg_emoji]

In [70]:
pos_patterns

[[{'ORTH': '😊'}],
 [{'ORTH': '😃'}],
 [{'ORTH': '😄'}],
 [{'ORTH': '😁'}],
 [{'ORTH': '😆'}],
 [{'ORTH': '😅'}]]

In [71]:
 neg_patterns

[[{'ORTH': '😞'}],
 [{'ORTH': '😔'}],
 [{'ORTH': '😟'}],
 [{'ORTH': '😕'}],
 [{'ORTH': '🙁'}],
 [{'ORTH': '☹️'}]]

In [72]:
def label_sentiment(matcher, doc, i, matches):
  """Adjust ``doc.sentiment`` according to the rule name of the i-th match.

  The match ID is resolved to its string name through ``doc.vocab.strings``:
  "HAPPY" adds 0.1 to ``doc.sentiment``, "SAD" subtracts 0.1, and any other
  name leaves it untouched. ``matcher`` is accepted but not used.
  """
  rule_id, _, _ = matches[i]
  rule_name = doc.vocab.strings[rule_id]  # look up the string form of the match ID
  if rule_name == "HAPPY":
    doc.sentiment += 0.1
  elif rule_name == 'SAD':
    doc.sentiment -= 0.1

In [73]:
matcher = Matcher(nlp.vocab)

In [74]:
matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)
matcher.add("SAD", neg_patterns, on_match=label_sentiment)

In [75]:
matcher.add("HASHTAG", [[{"ORTH":"#"}, {"IS_ASCII":True}]])

In [76]:
doc = nlp("Hello world 😊 #MondayMotivation")

In [77]:
matches = matcher(doc)

In [78]:
for match_id, start, end in matches:
  string_id = doc.vocab.strings[match_id] # look up string ID
  span = doc[start:end]
  print(string_id, span.text)

HAPPY 😊
HASHTAG #MondayMotivation


## Efficient Phrase Matching
- If you need to match large terminology lists, you can also use the PhraseMatcher and create Doc objects instead of token patterns, which is much more efficient overall. The Doc patterns can contain single or multiple tokens.

In [79]:
from spacy.matcher import PhraseMatcher

In [80]:
matcher = PhraseMatcher(nlp.vocab)

In [81]:
terms = ["BARACK OBAMA", "ANGELA MERKEL", "WASHINGTON D.C."]

In [82]:
pattern = [nlp.make_doc(text) for text in terms]
pattern

[BARACK OBAMA, ANGELA MERKEL, WASHINGTON D.C.]

In [83]:
matcher.add("term", pattern)

In [84]:
doc = nlp("German Chancellor ANGELA MERKEL and US President BARACK OBAMA converse in the Oval Office inside the White House in WASHINGTON D.C.")
doc

German Chancellor ANGELA MERKEL and US President BARACK OBAMA converse in the Oval Office inside the White House in WASHINGTON D.C.

In [85]:
matches = matcher(doc)

In [86]:
for match_id, start, end in matches:
  span = doc[start:end]
  print(span.text)

ANGELA MERKEL
BARACK OBAMA
WASHINGTON D.C.


In [87]:
matches

[(4519742297340331040, 2, 4),
 (4519742297340331040, 7, 9),
 (4519742297340331040, 19, 21)]

## Custom Rule-Based Entity Recognition
- The EntityRuler is an exciting new component that lets you add named entities based on pattern dictionaries, and makes it easy to combine rule-based and statistical named entity recognition for even more powerful models.

#### Entity Patterns
- Entity patterns are dictionaries with two keys: "label", specifying the label to assign to the entity if the pattern is matched, and "pattern", the match pattern. The entity ruler accepts two types of patterns:


1.   Phrase Pattern `{"label":"ORG", "pattern":"Apple"}`
2.   Token Pattern `{"label":"GPE", "pattern":[{"LOWER":"san"},{"LOWER":"francisco"}]}`

#### Using the Entity Ruler
- The EntityRuler is a pipeline component that's typically added via `nlp.add_pipe`. When the nlp object is called on a text, it will find matches in the doc and add them as entities to `doc.ents`, using the specified pattern label as the entity label.

* https://spacy.io/api/annotation#named-entities

In [88]:
from spacy.pipeline import EntityRuler

In [89]:
nlp = spacy.load("en_core_web_sm")

In [90]:
ruler = nlp.add_pipe("entity_ruler", name="entity_ruler", before="ner")

In [91]:
patterns = [{"label":"ORG", "pattern":"Apple"},
          {"label":"GPE", "pattern":[{"LOWER":"san"}, {"LOWER":"francisco"}]}]
patterns

[{'label': 'ORG', 'pattern': 'Apple'},
 {'label': 'GPE', 'pattern': [{'LOWER': 'san'}, {'LOWER': 'francisco'}]}]

In [92]:
ruler.add_patterns(patterns)

In [93]:
doc = nlp("Apple is opening a new store in San Francisco")

In [94]:
print([(ent.text, ent.label_) for ent in doc.ents])

[('Apple', 'ORG'), ('San Francisco', 'GPE')]
