## Named Entity Recognition

In [1]:
# Sample sentences used by the entity-recognition cells below.
s1, s2, s3, s4 = (
    "Apple is looking at buying U.K. startup for $1 billion",
    "San Francisco considers banning sidewalk delivery robots",
    "facebook is hiring a new vice president in U.S.",
    "shubham is my name",
)

In [2]:
# Importing spaCy
import spacy

# Load the `en_core_web_sm` pipeline once; later cells call it as `nlp`.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Run the pipeline on s1 and print each entity's text, label and the
# description spaCy gives for that label.
doc1 = nlp(s1)
for ent in doc1.ents:
    print(f"{ent.text} {ent.label_} {spacy.explain(ent.label_)}")

Apple ORG Companies, agencies, institutions, etc.
U.K. GPE Countries, cities, states
$1 billion MONEY Monetary values, including unit


In [4]:
# Same entity listing for the second sample sentence.
doc2 = nlp(s2)
for ent in doc2.ents:
    print(f"{ent.text} {ent.label_} {spacy.explain(ent.label_)}")

San Francisco GPE Countries, cities, states


In [5]:
# Entity listing for s3; per the output below, only "U.S." is recognised and
# the lowercase "facebook" is not.
doc3 = nlp(s3)
for ent in doc3.ents:
    print(f"{ent.text} {ent.label_} {spacy.explain(ent.label_)}")

U.S. GPE Countries, cities, states


In [6]:
# Look up the 'ORG' label in the doc's vocab string store; the result is used
# as the `label` of the manually built span in the next cell.
ORG = doc3.vocab.strings['ORG']

In [7]:
from spacy.tokens import Span
# Build a span over doc3 with start=0 and end=1 (the "facebook" token, per the
# output of the cells below) and label it with ORG.
new_ent = Span(doc3, 0, 1, label = ORG)

In [8]:
# Add the manual span on top of the entities the model already found.
# `set_ents(..., default="unmodified")` leaves every other token's entity
# annotation as it is, so the result matches appending to `doc3.ents` on the
# first run. Unlike appending, re-running this cell does not fail by adding
# the same span a second time.
doc3.set_ents([new_ent], default="unmodified")

In [9]:
# Show the entity tuple now that the manual span has been added.
doc3.ents

(facebook, U.S.)

In [10]:
# List doc3's entities again; the manually added span now appears too.
for ent in doc3.ents:
    print(f"{ent.text} {ent.label_} {spacy.explain(ent.label_)}")

facebook ORG Companies, agencies, institutions, etc.
U.S. GPE Countries, cities, states


In [11]:
# Entity listing for the fourth sample sentence (s4).
doc4 = nlp(s4)
for ent in doc4.ents:
    print(f"{ent.text} {ent.label_} {spacy.explain(ent.label_)}")

shubham PERSON People, including fictional


In [12]:
from spacy import displacy

In [13]:
# Render doc1's entities inline in the notebook.
displacy.render(doc1, style="ent", jupyter=True)

In [14]:
# Same rendering of doc1, with the `ents` option restricted to ORG.
org_only = {"ents": ["ORG"]}
displacy.render(doc1, style="ent", options=org_only, jupyter=True)

In [15]:
# Render doc3's entities inline in the notebook.
displacy.render(doc3, style="ent", jupyter=True)

In [16]:
# Render doc4's entities inline in the notebook.
displacy.render(doc4, style="ent", jupyter=True)

## Sentence Segmentation

In [17]:
# Two versions of the same text: sentences separated by periods (s1) and by
# semicolons (s2). Note that these rebind the s1/s2 names used in the NER section.
s1, s2 = (
    "This is a sentence. This is second sentence. This is last sentence.",
    "This is a sentence; This is second sentence; This is last sentence.",
)

In [18]:
# Re-process the period-separated text; this rebinds doc1 from the NER section.
doc1 = nlp(s1)

In [19]:
# Print one detected sentence per line.
for sentence in doc1.sents:
    print(sentence.text)

This is a sentence.
This is second sentence.
This is last sentence.


In [20]:
# Period-separated text that also contains the abbreviation "U.K.", to check
# that its periods are not treated as sentence ends.
s3 = (
    "This is a sentence. "
    "This is second U.K. sentence. "
    "This is last sentence."
)

In [21]:
# Segment the abbreviation sample and print one sentence per line.
doc3 = nlp(s3)
for sentence in doc3.sents:
    print(sentence.text)

This is a sentence.
This is second U.K. sentence.
This is last sentence.


In [22]:
# Segment the semicolon-separated text; per the output below, the default
# pipeline keeps it as a single sentence.
doc2 = nlp(s2)
for sentence in doc2.sents:
    print(sentence.text)

This is a sentence; This is second sentence; This is last sentence.


In [23]:
# Echo the raw semicolon-separated text for reference.
s2

'This is a sentence; This is second sentence; This is last sentence.'

In [25]:
from spacy.language import Language
@Language.component("component")
def set_custom_boundaries(doc):
    """Flag the token that follows every ';' as the start of a sentence.

    Args:
        doc: The document passed through the pipeline.

    Returns:
        The same ``doc`` object, with ``is_sent_start`` set to True on each
        token that comes directly after a ';' token.
    """
    # Stop one short of the end: the last token has no successor to flag.
    for position in range(len(doc) - 1):
        if doc[position].text == ";":
            doc[position + 1].is_sent_start = True
    return doc

In [26]:
# List the current pipeline component names, in order, before inserting the
# custom component.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [27]:
# Insert the custom boundary component ahead of the parser. The guard makes
# the cell safe to re-run: adding a name that is already in the pipeline
# raises an error.
if not nlp.has_pipe("component"):
    nlp.add_pipe("component", before="parser")
# Display the registered component, as the bare add_pipe call did.
nlp.get_pipe("component")

<function __main__.set_custom_boundaries(doc)>

In [28]:
# Re-process the semicolon-separated text now that the custom component is in
# the pipeline, and print one sentence per line.
doc_2 = nlp(s2)
for sentence in doc_2.sents:
    print(sentence.text)

This is a sentence;
This is second sentence;
This is last sentence.
