Part-of-Speech (POS) Tagging

In [102]:
import spacy
# Load the 'en_core_web_sm' pipeline once; the cells below all reuse this `nlp` object.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Run the example sentence through the pipeline; later cells index into `doc` by token position.
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [7]:
# Inspect token 4 of the sentence: one value per output line.
print(
    doc[4].pos_,                 # coarse part-of-speech tag
    doc[4].tag_,                 # fine-grained tag
    spacy.explain(doc[4].tag_),  # human-readable meaning of the fine-grained tag
    sep="\n",
)

VERB
VBD
verb, past tense


In [16]:
# The same surface form "read" receives a different fine-grained tag in each sentence.
doc2 = nlp(u"I read books")        # "read" is token 1
doc3 = nlp(u"I have read a book")  # "read" is token 2

# One explanation per line, in the same order as the sentences above.
print(spacy.explain(doc2[1].tag_), spacy.explain(doc3[2].tag_), sep="\n")

verb, non-3rd person singular present
verb, past participle


In [25]:
doc4 = nlp(u"The quick brown fox jumped over the lazy dog's back.")

# Count tokens per coarse POS tag on the doc created in this cell (previously this
# read from the older `doc`, which only worked because both hold the same sentence).
# count_by returns {attribute id: frequency}; pass spacy.attrs.TAG for fine-grained tags.
POS_counts = doc4.count_by(spacy.attrs.POS)
print(POS_counts)
# Keys are integer ids; look one up in the vocab to get its name (84 -> ADJ, so 3 adjectives).
print(doc4.vocab[84].text)

{90: 2, 84: 3, 92: 3, 100: 1, 85: 1, 94: 1, 97: 1}
ADJ


In [26]:
from spacy import displacy

In [27]:
# Draw the dependency visualisation ('dep' style) for doc4 with jupyter=True.
displacy.render(doc4, style='dep',jupyter=True)

In [30]:
doc5 = nlp(u"This is a sentence. This is a possibly longer sentence")
# Collect the sentence spans into a list and hand that list to displaCy.
spans = list(doc5.sents)
# serve() starts a web server for the visualisation (the output reports "Serving on ...").
# Inside Jupyter this is not needed: displacy.render(..., jupyter=True) can be used instead.
# NOTE(review): serve() appears to block this cell until the server is shut down - confirm
# before relying on "Run All".
displacy.serve(spans, style = 'dep')




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [02/Feb/2022 21:28:26] "GET / HTTP/1.1" 200 8124


Shutting down server on port 5000.


NAMED ENTITY RECOGNITION (NER)<br>
- seeks to locate and classify named entity mentions in unstructured text into pre-defined categories

In [33]:
def show_ents(doc):
    """Print each named entity of ``doc`` as ``text - label - explanation``.

    When ``doc.ents`` is empty, prints 'No entities found' instead. The
    explanation comes from ``spacy.explain`` and is wrapped in ``str`` so a
    label without a description is printed as 'None'.
    """
    if not doc.ents:
        print('No entities found')
        return
    for entity in doc.ents:
        description = str(spacy.explain(entity.label_))
        print(' - '.join((entity.text, entity.label_, description)))

In [43]:
# One text without named entities and one with several, covering both branches of show_ents.
doc6 = nlp(u"Hi how are you?")
doc7 = nlp(u"May I go to London, United Kingdom next May? I will need £500")

show_ents(doc6)
print("\n")  # blank lines separating the two listings
show_ents(doc7)

No entities found


London - GPE - Countries, cities, states
United Kingdom - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
500 - MONEY - Monetary values, including unit


In [51]:
doc8 = nlp(u"Tesla to build U.K. factory for £10 billion")
show_ents(doc8)
# NOTE: "Tesla" is not recognised as an entity here; only "U.K." and "£10 billion" are listed.

U.K. - GPE - Countries, cities, states
£10 billion - MONEY - Monetary values, including unit


In [52]:
from spacy.tokens import Span

In [53]:
# Add a new entity by hand.
ORG = doc8.vocab.strings[u"ORG"]
# Token span [0, 1) is "Tesla". This labels only this one occurrence in doc8
# (the PhraseMatcher cell further down labels every occurrence of a phrase).
new_ent = Span(doc8, 0, 1, label=ORG)
# doc.ents rejects overlapping spans, so drop anything that overlaps the new span first.
# On the first run nothing overlaps; on a re-run this replaces the "Tesla" entity added
# earlier instead of raising.
non_overlapping = [
    ent for ent in doc8.ents
    if ent.end <= new_ent.start or ent.start >= new_ent.end
]
doc8.ents = non_overlapping + [new_ent]

show_ents(doc8)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
£10 billion - MONEY - Monetary values, including unit


In [70]:
doc9 = nlp(u"Our company created a brand new vacuum cleaner. This new vacuum-cleaner is the best")
show_ents(doc9)  # before: no entities are reported for this text

from spacy.matcher import PhraseMatcher

# Both spellings of the product name, each run through the pipeline to serve as a pattern.
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = list(map(nlp, phrase_list))
matcher = PhraseMatcher(nlp.vocab)
matcher.add('newProduct', phrase_patterns)

# Each match is a 3-tuple; positions 1 and 2 are the start/end token offsets of the hit.
found_matches = matcher(doc9)
PROD = doc9.vocab.strings[u"PRODUCT"]  # label every matched phrase as a PRODUCT
new_ents = [Span(doc9, start, end, label=PROD) for _, start, end in found_matches]

doc9.ents = [*doc9.ents, *new_ents]
print("\n")
show_ents(doc9)  # after: both spellings are listed as PRODUCT

No entities found


vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [76]:
doc10 = nlp(u"Originally I paid £5 but now it is marked down for £3, I want a refund of £2!")

# Number of entities labelled MONEY; as the last expression it is shown as the cell's output.
sum(1 for ent in doc10.ents if ent.label_ == "MONEY")

3

In [77]:
doc11 = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $10 billion")
# Draw the entity visualisation ('ent' style) for doc11 with jupyter=True.
displacy.render(doc11, style='ent',jupyter=True)

SENTENCE SEGMENTATION

In [79]:
doc12 = nlp(u"This is the first sentence. This is the second. This is the third sentence")
# doc.sents yields one span per detected sentence; print them one per line.
for sent in doc12.sents:
    print(sent)

This is the first sentence.
This is the second.
This is the third sentence


In [80]:
# A quotation containing a semicolon, followed by an attribution.
doc13 = nlp(u'"Management is doing the right things; leadership is doing the right things." - Peter')

In [83]:
# Print each detected sentence followed by blank lines so the boundaries are easy to see.
for sent in doc13.sents:
    print(sent)
    print("\n")

"Management is doing the right things; leadership is doing the right things."


- Peter




In [95]:
# ADD A SEGMENTATION RULE
from spacy.language import Language
@Language.component('custom_boundaries')
def set_custom_boundaries(doc):
    """Mark the token after every ';' as the start of a new sentence.

    Registered as the pipeline component 'custom_boundaries'. The last token
    is skipped when scanning because it has no following token to mark.
    Returns the same ``doc`` it was given.
    """
    # token.i is the token's index within the doc, so i + 1 is its successor.
    semicolon_positions = [tok.i for tok in doc[:-1] if tok.text == ";"]
    for position in semicolon_positions:
        doc[position + 1].is_sent_start = True
    return doc
            
# Insert the component before the parser. The membership check keeps this cell re-runnable:
# add_pipe raises if a component named 'custom_boundaries' is already in the pipeline.
if 'custom_boundaries' not in nlp.pipe_names:
    nlp.add_pipe('custom_boundaries', before='parser')

doc14 = nlp(u'"Management is doing the right things; leadership is doing the right things." - Peter')
for sent in doc14.sents: # now splits on the semicolon as well
    print(sent)

"Management is doing the right things;
leadership is doing the right things."
- Peter


In [96]:
# The text contains a blank line ("\n\n") and a mid-sentence newline ("\n") to show how
# embedded newlines come out in the detected sentences.
mystring = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."
doc15 = nlp(mystring)
for sentence in doc15.sents:
    print(sentence)

This is a sentence.
This is another.


This is a 
third sentence.


In [103]:
# can also remove and change the current rules as well as add new ones if needed