<a href="https://www.kaggle.com/code/sagorkumarmitra/part-of-speech-tagging-named-entity-recognition?scriptVersionId=190345904" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')

doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [2]:
doc.text

"The quick brown fox jumped over the lazy dog's back."

In [3]:
doc[4]

jumped

In [4]:
doc[4].pos_

'VERB'

In [5]:
doc[4].tag_

'VBD'

In [6]:
for token in doc:
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {spacy.explain(token.tag_)}")

The        DET        DT         determiner
quick      ADJ        JJ         adjective (English), other noun-modifier (Chinese)
brown      ADJ        JJ         adjective (English), other noun-modifier (Chinese)
fox        NOUN       NN         noun, singular or mass
jumped     VERB       VBD        verb, past tense
over       ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
lazy       ADJ        JJ         adjective (English), other noun-modifier (Chinese)
dog        NOUN       NN         noun, singular or mass
's         PART       POS        possessive ending
back       NOUN       NN         noun, singular or mass
.          PUNCT      .          punctuation mark, sentence closer


In [7]:
doc = nlp(u'I read books on NLP.')
r = doc[1]

print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}')

read       VERB     VBP    verb, non-3rd person singular present


In [8]:
doc = nlp(u'I read a book on NLP.')
r = doc[1]

print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}')

read       VERB     VBD    verb, past tense


In [9]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

# Count the frequencies of different coarse-grained POS tags:
POS_counts = doc.count_by(spacy.attrs.POS)
POS_counts

{90: 2, 84: 3, 92: 3, 100: 1, 85: 1, 94: 1, 97: 1}

In [10]:
doc.vocab[84].text

'ADJ'

In [11]:
doc[3].pos_

'NOUN'

In [12]:
for k, v in sorted(POS_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

84. ADJ   3
85. ADP   1
90. DET   2
92. NOUN  3
94. PART  1
97. PUNCT 1
100. VERB  1


In [13]:
TAG_counts = doc.count_by(spacy.attrs.TAG)
for k, v in sorted(TAG_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

74. POS   1
1292078113972184607. IN    1
10554686591937588953. JJ    3
12646065887601541794. .     1
15267657372422890137. DT    2
15308085513773655218. NN    3
17109001835818727656. VBD   1


In [14]:
len(doc.vocab)

791

In [15]:
DEP_counts = doc.count_by(spacy.attrs.DEP)
for k, v in sorted(DEP_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

400. advmod 1
402. amod  3
415. det   2
429. nsubj 1
439. pobj  1
443. prep  1
445. punct 1
8110129090154140942. case  1
8206900633647566924. ROOT  1


In [16]:
import spacy
nlp = spacy.load('en_core_web_sm')

doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [17]:
from spacy import displacy
displacy.render(doc,style='dep')

## Named Entity Recognition (NER)

In [18]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [19]:
def show_ents(doc):
    """Print every named entity found in ``doc``, one per line.

    Each line has the form ``<text> - <label> - <label description>``, where the
    description comes from ``spacy.explain`` (rendered as ``None`` when spaCy has
    no explanation for the label). If ``doc.ents`` is empty, prints ``No entity``.
    """
    # Guard clause: nothing to list.
    if not doc.ents:
        print('No entity')
        return

    for span in doc.ents:
        description = str(spacy.explain(span.label_))
        print(' - '.join((span.text, span.label_, description)))

In [20]:
doc = nlp(u"May go to Washingto, DC next May to see the Washingto Monument?")

In [21]:
show_ents(doc)

Washingto, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washingto Monument - FAC - Buildings, airports, highways, bridges, etc.


In [22]:
doc.ents

(Washingto, DC, next May, the Washingto Monument)

In [23]:
doc = nlp(u'Tesla to build a U.K. factory for $6 million')

show_ents(doc)

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [24]:
from spacy.tokens import Span

# Get the hash value of the ORG entity label
ORG = doc.vocab.strings[u'ORG']  

# Create a Span for the new entity
new_ent = Span(doc, 0, 1, label=ORG)

# Add the entity to the existing Doc object
doc.ents = list(doc.ents) + [new_ent]

In [25]:
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [26]:
doc = nlp(u'Our company plans to introduce a new vacuum cleaner. '
          u'If successful, the vacuum cleaner will be our first product.')

show_ents(doc)

first - ORDINAL - "first", "second", etc.


In [27]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [28]:
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = [nlp(text) for text in phrase_list]

In [29]:
# Register all phrase Docs under the 'newproduct' key.
# Uses the list-based add(key, docs) signature rather than the legacy
# add(key, on_match, *docs) form, which newer spaCy releases only keep for
# backwards compatibility.
matcher.add('newproduct', phrase_patterns)

In [30]:
found_matches = matcher(doc)

In [31]:
found_matches

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 16)]

In [32]:
PROD = doc.vocab.strings[u'PRODUCT']

In [33]:
# Each match is a (match_id, start, end) triple; wrap the token range in a Span labelled PROD.
new_ents = [Span(doc, start, end, label=PROD) for _, start, end in found_matches]

In [34]:
new_ents

[vacuum cleaner, vacuum cleaner]

In [35]:
doc.ents = list(doc.ents) + new_ents

In [36]:
doc.ents

(vacuum cleaner, vacuum cleaner, first)

In [37]:
doc = nlp(u'Originally priced at $29.50, the sweater was marked down to five dollars.')

show_ents(doc)

29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


In [38]:
[ent for ent in doc.ents if ent.label_ == 'MONEY']

[29.50, five dollars]

In [39]:
import spacy
nlp = spacy.load('en_core_web_sm')

# Import the displaCy library
from spacy import displacy

In [40]:
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
         u'By contrast, Sony sold only 7 thousand Walkman music players.')

displacy.render(doc, style='ent', jupyter=True)

In [41]:
for sent in doc.sents:
    displacy.render(nlp(sent.text),style='ent')

In [42]:
options = {'ents': ['PRODUCT','ORG']}

In [43]:
displacy.render(doc,style='ent',options=options)

In [44]:
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

for sent in doc.sents:
    print(sent)

This is the first sentence.
This is another sentence.
This is the last sentence.


## Sentence Segmentation

In [45]:
import spacy
nlp = spacy.load('en_core_web_sm')

doc = nlp(u'"This is the first sentence; This is another sentence." This is the last sentence.')


In [46]:
list(doc.sents)

["This is the first sentence; This is another sentence.",
 This is the last sentence.]

In [47]:
for sent in doc.sents:
    print(sent)
    print('\n')

"This is the first sentence; This is another sentence."


This is the last sentence.




In [48]:
# NOTE(review): disabled draft of a custom sentence-boundary component that starts a
# new sentence after every ';'. It passes the function object straight to
# nlp.add_pipe, which looks like the spaCy v2 calling convention — presumably it was
# commented out because it fails on the installed spaCy version. TODO confirm, then
# either register the component properly or delete this cell.
# def set_custom_boundaries(doc):
#     for token in doc[:-1]:
#         if token.text == ';':
#             doc[token.i+1].is_sent_start = True
#     return doc

# nlp.add_pipe(set_custom_boundaries, before='parser')

# nlp.pipe_names

In [49]:
def set_custom_boundaries(doc):
    """Mark the token following each ';' as the start of a new sentence.

    The last token is never inspected (there is no token after it to flag).
    Mutates ``doc`` in place by setting ``is_sent_start = True`` on the affected
    tokens and returns the same ``doc`` object.
    """
    # Positions of all semicolons except one in final position.
    semicolon_positions = [tok.i for tok in doc[:-1] if tok.text == ';']
    for position in semicolon_positions:
        doc[position + 1].is_sent_start = True
    return doc

In [50]:
# # CHANGING THE RULES
# from spacy.pipeline import Sentencizer

# def split_on_newlines(doc):
#     start = 0
#     seen_newline = False
#     for word in doc:
#         if seen_newline:
#             yield doc[start:word.i]
#             start = word.i
#             seen_newline = False
#         elif word.text.startswith('\n'): # handles multiple occurrences
#             seen_newline = True
#     yield doc[start:]      # handles the last group of tokens


# sbd = Sentencizer(nlp.vocab, strategy=split_on_newlines)
# nlp.add_pipe(sbd)

In [51]:
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy import displacy

In [52]:
# Read the whole text file and run it through the spaCy pipeline in one call.
# The encoding is given explicitly so decoding does not depend on the platform's
# default locale.
with open('/kaggle/input/peterrabbit/peterrabbit.txt', encoding='utf-8') as f:
    doc = nlp(f.read())

In [53]:
for token in list(doc.sents)[2]:
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {str(spacy.explain(token.tag_))}")

They       PRON       PRP        pronoun, personal
lived      VERB       VBD        verb, past tense
with       ADP        IN         conjunction, subordinating or preposition
their      PRON       PRP$       pronoun, possessive
Mother     NOUN       NN         noun, singular or mass
in         ADP        IN         conjunction, subordinating or preposition
a          DET        DT         determiner
sand       NOUN       NN         noun, singular or mass
-          PUNCT      HYPH       punctuation mark, hyphen
bank       NOUN       NN         noun, singular or mass
,          PUNCT      ,          punctuation mark, comma
underneath ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
root       NOUN       NN         noun, singular or mass
of         ADP        IN         conjunction, subordinating or preposition
a          DET        DT         determiner

          SPACE      _SP        whitespace
very       ADV        RB       

In [54]:
doc.count_by(spacy.attrs.POS)

{90: 90,
 96: 74,
 85: 125,
 97: 171,
 93: 9,
 103: 99,
 86: 63,
 98: 19,
 92: 172,
 95: 110,
 100: 135,
 84: 53,
 89: 61,
 87: 49,
 94: 28}

In [55]:
POS_counts = doc.count_by(spacy.attrs.POS)

for k, v in sorted(POS_counts.items()):
    print(f"{k} {doc.vocab[k].text} {v}")

84 ADJ 53
85 ADP 125
86 ADV 63
87 AUX 49
89 CCONJ 61
90 DET 90
92 NOUN 172
93 NUM 9
94 PART 28
95 PRON 110
96 PROPN 74
97 PUNCT 171
98 SCONJ 19
100 VERB 135
103 SPACE 99


In [56]:
# Percentage of all tokens that are tagged NOUN.
# The attribute ID is looked up by name instead of hard-coding 92, so the cell
# states what it counts and does not rely on spaCy's internal symbol numbering.
noun_id = doc.vocab.strings['NOUN']
POS_counts[noun_id] / len(doc) * 100

13.672496025437203

In [57]:
displacy.render(list(doc.sents)[2],style='dep')

In [58]:
list(doc.sents)[2]

They lived with their Mother in a sand-bank, underneath the root of a
very big fir-tree.


In [59]:
for ent in doc.ents[:2]:
    print(ent.text + ' ' + ent.label_ + ' ' + str(spacy.explain(ent.label_)))

The Tale of Peter Rabbit WORK_OF_ART Titles of books, songs, etc.
Beatrix Potter PERSON People, including fictional


In [60]:
len(list(doc.sents))

55

In [61]:
# Run every sentence through the pipeline again as its own Doc ...
list_of_sents = [nlp(sentence.text) for sentence in doc.sents]
# ... and keep only the sentence Docs that contain at least one named entity.
list_of_ners = [sent_doc for sent_doc in list_of_sents if sent_doc.ents]

In [62]:
len(list_of_ners)

36

In [63]:
displacy.render(list_of_sents[0], style='ent')