#### USING LINGUISTIC ANNOTATIONS

In [1]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy
import re

In [2]:
# Load the 'en_core_web_md' spaCy pipeline; every later cell tokenizes and annotates text through `nlp`.
nlp = spacy.load('en_core_web_md')

In [3]:
# Token-pattern matcher bound to the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [4]:
# Module-level accumulator filled by `callback`: one {'text', 'ents'} dict per match.
matched_sents = []

In [5]:
# One dict per token: "facebook" in any casing, a form of "be",
# zero or more adverbs, then an adjective.
pattern = [
    {"LOWER": "facebook"},
    {"LEMMA": "be"},
    {"POS": "ADV", "OP": "*"},
    {"POS": "ADJ"},
]

In [6]:
def callback(matcher, doc, i, matches):
    """Record the sentence around match ``i`` as a manual displaCy ``ent`` entry.

    Appends ``{'text': ..., 'ents': [...]}`` to the module-level
    ``matched_sents`` list. The single entity is labelled ``'MATCH'`` and its
    ``start``/``end`` are character offsets relative to the sentence start.

    Args:
        matcher: The matcher invoking the callback (not used here).
        doc: Document being matched; sliced with the match's token bounds.
        i: Index of the current match within ``matches``.
        matches: Sequence of ``(match_id, start, end)`` tuples.
    """
    _, tok_start, tok_end = matches[i]
    hit = doc[tok_start:tok_end]
    sentence = hit.sent

    # Re-base the span's document-level character offsets onto the sentence.
    offset = sentence.start_char
    highlight = {
        'start': hit.start_char - offset,
        'end': hit.end_char - offset,
        'label': 'MATCH',
    }

    matched_sents.append({'text': sentence.text, 'ents': [highlight]})

In [7]:
# Register the pattern under the key "fb"; `callback` is passed as the on-match handler.
matcher.add("fb",callback,pattern)

In [8]:
# Sample text with two sentences that each contain a "Facebook is <adjective>" phrase.
doc= nlp("I'd say that Facebook is evil. - Facebook is pretty cool, right ? ")

In [9]:
# `callback` appends to the global `matched_sents` on every match, so empty the
# list first; otherwise re-running this cell duplicates the recorded sentences.
matched_sents.clear()

# Each match is a (match_id, start_token, end_token) tuple.
matches = matcher(doc)
matches

[(8017838677478259815, 4, 7), (8017838677478259815, 9, 13)]

In [10]:
# Inspect what `callback` collected while the matcher ran.
matched_sents

[{'text': "I'd say that Facebook is evil.",
  'ents': [{'start': 13, 'end': 29, 'label': 'MATCH'}]},
 {'text': '- Facebook is pretty cool, right ?',
  'ents': [{'start': 2, 'end': 25, 'label': 'MATCH'}]}]

In [11]:
# manual=True: render the pre-built {'text', 'ents'} dicts directly rather than a Doc.
displacy.render(matched_sents, style ='ent', manual =True)

#### PHONE NUMBERS

In [12]:
# Token shapes for "(ddd) dddd[-]dddd": the hyphen between the two
# four-digit groups is optional.
pattern = [
    {"ORTH": "("},
    {"SHAPE": "ddd"},
    {"ORTH": ")"},
    {"SHAPE": "dddd"},
    {"ORTH": "-", "OP": "?"},
    {"SHAPE": "dddd"},
]

In [13]:
# Fresh matcher so the earlier "fb" rule (and its callback) no longer fires.
matcher = Matcher(nlp.vocab)
# None: no on-match callback for this rule.
matcher.add("PhoneNumber",None,pattern)

In [14]:
doc = nlp("call me at (123) 4567 7789")
# Show the tokenization the pattern is matched against.
print([token.text for token in doc])

['call', 'me', 'at', '(', '123', ')', '4567', '7789']


In [15]:
# Each match is a (match_id, start_token, end_token) tuple.
matches=matcher(doc)
matches

[(7978097794922043545, 3, 8)]

#### EMAIL ADDRESS MATCHING

In [16]:
# Single-token rule: the token text must contain "<local>@<domain>", where both
# parts are runs of letters, digits, '.', '_' or '-'.
# The hyphen is placed last in each character class so it is unambiguously a
# literal rather than something that could be read as part of a range.
pattern = [{"TEXT": {"REGEX": r"[a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+"}}]

In [17]:
# Fresh matcher holding only the email rule; None means no on-match callback.
matcher = Matcher(nlp.vocab)
matcher.add("Email",None,pattern)

In [18]:
# Sample text containing two email addresses.
doc = nlp("Email me at abc123@gmail.com and def_gh@klm.in")

In [19]:
# Each match is a (match_id, start_token, end_token) tuple.
matches= matcher(doc)
matches

[(11010771136823990775, 3, 4), (11010771136823990775, 5, 6)]

In [20]:
# Turn each (match_id, start, end) triple back into the matched text.
for _rule_hash, tok_start, tok_end in matches:
    print(doc[tok_start:tok_end].text)

abc123@gmail.com
def_gh@klm.in


#### EMOJI SENTIMENT ANALYSIS

In [21]:
# Emoji treated as positive signals.
pos_emoji = [
    '😊',
    '🤣',
    '😂',
    '😍',
    '😃',
]
# Emoji treated as negative signals.
neg_emoji = [
    '😐',
    '😫',
    '😭',
    '😢',
    '😩',
]

In [22]:
# One single-token pattern per emoji, matched on the exact token text.
pos_patterns = [[{"ORTH": symbol}] for symbol in pos_emoji]
neg_patterns = [[{"ORTH": symbol}] for symbol in neg_emoji]

In [23]:
# Inspect the generated patterns: a list of one-token patterns.
pos_patterns

[[{'ORTH': '😊'}],
 [{'ORTH': '🤣'}],
 [{'ORTH': '😂'}],
 [{'ORTH': '😍'}],
 [{'ORTH': '😃'}]]

In [24]:
def label_sentiment(matcher, doc, i, matches):
    """Nudge ``doc.sentiment`` according to which rule produced match ``i``.

    A match whose rule name is ``'HAPPY'`` adds 0.1, ``'SAD'`` subtracts 0.1;
    any other rule leaves the sentiment untouched.

    Args:
        matcher: The matcher invoking the callback (not used here).
        doc: Document whose ``sentiment`` attribute is adjusted in place.
        i: Index of the current match within ``matches``.
        matches: Sequence of ``(match_id, start, end)`` tuples.
    """
    rule_hash, _start, _end = matches[i]
    # The match id is a hash; resolve it back to the rule name.
    rule_name = doc.vocab.strings[rule_hash]
    delta = {'HAPPY': 0.1, 'SAD': -0.1}.get(rule_name)
    if delta is not None:
        doc.sentiment += delta

In [25]:
# Fresh matcher for the emoji and hashtag rules.
matcher = Matcher(nlp.vocab)

In [26]:
# Unpack the lists so each emoji is its own pattern under a shared rule name;
# `label_sentiment` checks that name to decide the sign of the adjustment.
matcher.add("HAPPY",label_sentiment,*pos_patterns)
matcher.add("SAD",label_sentiment,*neg_patterns)

In [27]:
# Hashtag rule: a "#" token followed by one ASCII token; no callback.
matcher.add("HASHTAG",None,[{"TEXT":"#"},{"IS_ASCII":True}])

In [28]:
# Sample text with one emoji from `pos_emoji` and one hashtag.
doc =  nlp("Hello World 😊 Have a great day! #THUGLIFE")

In [29]:
# Running the matcher also invokes `label_sentiment` for HAPPY/SAD matches.
matches= matcher(doc)
matches

[(2686646543460454932, 2, 3), (16536914698459818706, 8, 10)]

In [30]:
# Print each match as "<rule name> <matched text>"; the rule name is looked up
# from the match id through the vocabulary's string store.
for rule_hash, tok_start, tok_end in matches:
    print(doc.vocab.strings[rule_hash], doc[tok_start:tok_end].text)

HAPPY 😊
HASHTAG #THUGLIFE


#### EFFICIENT PHRASE MATCHER

In [31]:
from spacy.matcher import PhraseMatcher

In [32]:
# Phrase matcher: its patterns are Doc objects rather than token-attribute dicts.
matcher = PhraseMatcher(nlp.vocab)

In [33]:
# Literal phrases to look up in the text.
terms = [
    'BARAC OBAMA',
    'ANGELA MERKEL',
    'WASHINGTON D.C.',
]

In [34]:
# make_doc only tokenizes, which is all the phrase patterns need here.
pattern = [nlp.make_doc(term) for term in terms]
pattern

[BARAC OBAMA, ANGELA MERKEL, WASHINGTON D.C.]

In [35]:
# Register every phrase under the single key "term"; no on-match callback.
matcher.add("term",None, *pattern)

In [36]:
# Sample sentence containing all three terms.
doc = nlp("German Chancellor ANGELA MERKEL and US President BARAC OBAMA converse in the Oval Office inside the White House in WASHINGTON D.C.")

In [37]:
# Each match is a (match_id, start_token, end_token) tuple; all share the "term" id.
matches= matcher(doc)
matches

[(4519742297340331040, 2, 4),
 (4519742297340331040, 7, 9),
 (4519742297340331040, 19, 21)]

In [38]:
# Show the text covered by each matched token range.
for _rule_hash, tok_start, tok_end in matches:
    print(doc[tok_start:tok_end].text)

ANGELA MERKEL
BARAC OBAMA
WASHINGTON D.C.


#### USING THE ENTITY RULER

In [39]:
from spacy.pipeline import EntityRuler

In [40]:
# Rule-based entity component built from the loaded pipeline.
ruler = EntityRuler(nlp)

In [41]:
# Two entity rules: an exact-phrase ORG, and a token pattern that labels
# "san francisco" in any casing as a GPE.
patterns = [
    {
        "label": "ORG",
        "pattern": "KING ORGS",
    },
    {
        "label": "GPE",
        "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}],
    },
]

In [42]:
# Load the label/pattern dicts into the ruler.
ruler.add_patterns(patterns)

In [43]:
# Attach the ruler to the pipeline so later nlp(...) calls apply its patterns.
# NOTE(review): no position argument is given, so the component lands wherever
# add_pipe puts it by default — confirm whether it should run before the statistical NER.
# NOTE(review): this mutates `nlp`; re-running the cell on a live kernel presumably fails
# because the component is already registered — verify.
nlp.add_pipe(ruler)

In [44]:
# Sample sentence exercising both ruler patterns.
# NOTE(review): "is mouth" looks like a typo for "his mouth"; left untouched because the text is model input.
doc = nlp("KING ORGS is opening is mouth for his first orientation speech at san francisco")

In [45]:
# List every recognised entity: text left-aligned and padded to 20 columns, then its label.
for ent in doc.ents:
    # `:<{20}` is a nested format field that evaluates to width 20.
    print(f'{ent.text:<{20}} {ent.label_}')

KING ORGS            ORG
first                ORDINAL
san francisco        GPE
