In [1]:
import spacy;

In [2]:
nlp = spacy.load ('en_core_web_sm');

In [3]:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print (ent.text + ' - ' +ent.label_ + ' - ' + str(spacy.explain(ent.label_)))
    else:
        print('No entities found');
        


In [4]:
doc = nlp(u'Hi how are you?')

In [5]:
show_ents(doc)

No entities found


In [6]:
doc = nlp (u"May I go to Patiala, Punjab next may to see the Sheesh Mahal?")

In [7]:
show_ents(doc)

Patiala - PERSON - People, including fictional
Punjab - PERSON - People, including fictional
the Sheesh Mahal - ORG - Companies, agencies, institutions, etc.


In [8]:
# wow, it did not get anything right at all; culturally ignorant

In [9]:
doc = nlp(u"Can I please have 500 dollars of Amazon stock to burn?")

In [10]:
show_ents(doc)

500 dollars - MONEY - Monetary values, including unit
Amazon - ORG - Companies, agencies, institutions, etc.


In [11]:
doc = nlp(u"Vanmoof to build a Indian garage for $7 million")

In [12]:
show_ents(doc)

Indian - NORP - Nationalities or religious or political groups
$7 million - MONEY - Monetary values, including unit


In [14]:
# add Vanmoof as an org since it didn't pop up!

In [15]:
from spacy.tokens import Span

In [17]:
ORG = doc.vocab.strings[u"ORG"]

In [18]:
ORG

381

In [19]:
new_ent = Span(doc, 0, 1, label=ORG)

In [20]:
doc.ents = list(doc.ents) + [new_ent]

In [23]:
# word added to vocab as an ORG! Going to add Patiala and Punjab too

In [26]:
 GPE = doc.vocab.string[u"GPE"]

AttributeError: 'spacy.vocab.Vocab' object has no attribute 'string'

In [27]:
# n e v e r m i n d

## PART 2

In [29]:
doc = nlp (u"Our company created a brand new vacuum cleaner." 
           u"This new vacuum-cleaner is state of the art.");

In [30]:
show_ents(doc)

No entities found


In [31]:
from spacy.matcher import PhraseMatcher 

In [32]:
matcher = PhraseMatcher(nlp.vocab)

In [33]:
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']

In [34]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [35]:
matcher.add('newproduct', None, *phrase_patterns)

In [36]:
found_matches = matcher(doc)

In [37]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [38]:
# create spans for each match and create named entities from them

In [39]:
from spacy.tokens import Span

In [40]:
PROD = doc.vocab.strings[u"PRODUCT"]

In [41]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [42]:
new_ents = [Span(doc, match[1], match[2], label=PROD) for match in found_matches]

In [51]:
doc_ents = list(doc.ents) + new_ents

In [52]:
show_ents(doc)

No entities found


In [48]:
doc

Our company created a brand new vacuum cleaner.This new vacuum-cleaner is state of the art.

In [50]:
PROD

384

In [54]:
# FIX ME: can't add for some reason :/

In [56]:
doc = nlp(u"Originally I paid $29.95 for this car toy, but now it is marked down by $10")

In [59]:
# let's count how many times money was mentioned
len([ent for ent in doc.ents if ent.label_ == "MONEY"])

2

## PART 3: Displacy

In [60]:
from spacy import displacy

In [63]:
doc = nlp(u"Over the last quarter Playdate sold nearly 20 thousand consoles for a profit of $3 million."
          u"By contrast, Nintendo only sold $8 million switch consoles.")

In [62]:
displacy.render(doc, style = 'ent', jupyter = True)

In [64]:
for sent in doc.sents:
    displacy.render(nlp(sent.text), style = 'ent', jupyter = True)

In [65]:
# let's customize our render style
options = {'ents': ['ORG']}

In [67]:
displacy.render(doc, style = 'ent', jupyter = True, options = options)

In [68]:
options = {'ents': ['ORG', 'DATE']}

In [69]:
displacy.render(doc, style = 'ent', jupyter = True, options = options)

In [80]:
colors = {'ORG': '#EEEEFF'}
options = {'ents': ['ORG', 'DATE'], 'colors': colors}

In [81]:
displacy.render(doc, style = 'ent', jupyter = True, options = options)

In [85]:
colors = {'ORG': 'radial-gradient(#00EEFF, #EEEEFF)'}
options = {'ents': ['ORG', 'DATE'], 'colors': colors}

In [86]:
displacy.render(doc, style = 'ent', jupyter = True, options = options)

In [91]:
colors = {'ORG': 'linear-gradient(90deg, #00EEFF, #EEEEFF)'}
options = {'ents': ['ORG', 'DATE'], 'colors': colors}

In [94]:
displacy.render(doc, style = 'ent', jupyter = True, options = options)

In [93]:
displacy.serve(doc, style = 'ent', options = options)


[93m    Serving on port 5000...[0m
    Using the 'ent' visualizer



127.0.0.1 - - [14/Feb/2023 19:23:50] "GET / HTTP/1.1" 200 1774
127.0.0.1 - - [14/Feb/2023 19:23:50] "GET /favicon.ico HTTP/1.1" 200 1774



    Shutting down server on port 5000.



## ~~~~~~~~~~ Sentence Segmentation ~~~~~~~~

In [95]:
doc = nlp (u"This is the first sentence. This is another sentence. This is the last sentence.")

In [96]:
doc[0]

This

In [97]:
doc.sents

<generator at 0x1b55a36e318>

In [98]:
for i in doc.sents:
    print (i)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [99]:
doc.sents[0]

TypeError: 'generator' object is not subscriptable

In [100]:
type(list(doc.sents)[0])

spacy.tokens.span.Span

In [102]:
doc = nlp(u'"Management is doing the right things; leadership is doing the right things." - Peter Drucker')

In [103]:
doc.text

'"Management is doing the right things; leadership is doing the right things." - Peter Drucker'

In [106]:
for i in doc.sents:
    print (i, '\n \t')

"Management is doing the right things; leadership is doing the right things." 
 	
- Peter Drucker 
 	


In [120]:
# Add a segmentation rule to format correctly
def set_custom_boundaries(doc):
    # added "[:-1]" to compensate for buffer *
    for token in doc[:-1]:
        print (token.i, " ", token, "\n")  
        # adding new segmentation rule:
        if token.text == ';':
            # the "[:-1]" was added to compensate for +1 offset here
            doc[token.i + 1].is_sent_start = True
    return doc

In [122]:
nlp.add_pipe(set_custom_boundaries, before = 'parser')
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [124]:
doc[:-1]

"Management is doing the right things; leadership is doing the right things." - Peter

In [126]:
doc4 = nlp(u'"Management is doing the right things; leadership is doing the right things." - Peter Drucker')

0   " 

1   Management 

2   is 

3   doing 

4   the 

5   right 

6   things 

7   ; 

8   leadership 

9   is 

10   doing 

11   the 

12   right 

13   things 

14   . 

15   " 

16   - 

17   Peter 



In [128]:
for i in doc4.sents:
    print (i)    

"Management is doing the right things;
leadership is doing the right things.
" - Peter Drucker


In [108]:
# Change segmentation rules

In [130]:
# reloading to reset pipeline and remove custom boundaries
nlp = spacy.load('en_core_web_sm')

In [131]:
my_str = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."

In [132]:
print (my_str)

This is a sentence. This is another.

This is a 
third sentence.


In [133]:
# the periods might not be as important as linebreaks in poetry; 
# therefore we should change the line-break structure for those type of datasets

In [134]:
doc = nlp(my_str)

In [136]:
for i in doc.sents:
    print (i)

This is a sentence.
This is another.


This is a 
third sentence.


In [137]:
from spacy.pipeline import SentenceSegmenter

In [139]:
def split_on_newlines(doc):
    start = 0;
    seen_newline = False
    for word in doc:
        # yield to start token of the word, reset the word's token as a start of a line
        if seen_newline:
            yield doc[start:word.i]
            start = word.i
            seen_newline = False
        # to establish the \n token as a new sentence boundary
        elif word.text.startswith('\n'):
            seen_newline = True
    yield doc[start:]

In [141]:
sbd = SentenceSegmenter(nlp.vocab, strategy = split_on_newlines)

In [142]:
nlp.add_pipe(sbd);
nlp.pipe_names

['tagger', 'parser', 'ner', 'sbd']

In [143]:
doc = nlp(my_str)

In [144]:
for i in doc.sents:
    print (i)

This is a sentence. This is another.


This is a 

third sentence.


In [145]:
# notice that the periods are no longer sentence splitters! but the \n are!