In [32]:
#import the English language class
import spacy
from spacy.lang.en import English 

In [33]:
# Load the small English pipeline; the cells below call it through `nlp`.
nlp = spacy.load("en_core_web_sm")

Noun Chunks


In [34]:
# Parse one sentence and print each noun chunk on its own line.
doc = nlp(u"Autonoumous cars shift insurence liability towards manufactures")
for noun_chunk in doc.noun_chunks:
    print(noun_chunk.text)

Autonoumous cars
insurence liability
manufactures


In [35]:
# Second noun-chunk example, kept in its own variable (`doc1`).
doc1 = nlp("Red car do not carry higher insurance rates")
for noun_chunk in doc1.noun_chunks:
    print(noun_chunk.text)

Red car
higher insurance rates


In [36]:
# Third noun-chunk example: a long run of modifiers in front of the noun.
doc2 = nlp("He was a one eyed one hundered flying purple people-eater")
for noun_chunk in doc2.noun_chunks:
    print(noun_chunk.text)

He
purple people


Built-in Visualizers

In [37]:
from spacy import displacy

In [38]:
# Render the dependency parse inline in the notebook.
# The "distance" option is handed to displaCy unchanged.
doc = nlp("Apple is going to build  a U.K. factory for $6 million ")
displacy.render(
    doc,
    style="dep",
    jupyter=True,
    options={"distance": 110},
)

In [39]:
# Render the named entities found in the sentence inline in the notebook.
# The amount is written as "$6 million" (the original had "%6 million", a
# typo for the dollar sign) so the sample contains a monetary value, matching
# the "$6 million" used in the dependency example.
doc = nlp("over the last quarter Apple sold nearly 20 thousand ipods for a profit of $6 million")
displacy.render(doc, style='ent', jupyter=True)

In [40]:
# Stop words
# Load the English pipeline again before inspecting its defaults.
nlp = spacy.load('en_core_web_sm')
# Show spaCy's default stop-word set; sets are unordered, so the printed
# order carries no meaning.
print(nlp.Defaults.stop_words)

{'on', 'more', 'between', 'anywhere', 'already', 'than', 'as', 'doing', 'per', 'either', 'first', 'across', 'his', 'together', 'everything', 'whether', 'fifty', 'they', 'your', 'if', 'herein', 'after', 'nine', 'itself', '‘s', 'must', "'ve", 'amongst', 'here', '‘d', 'not', 'put', 'them', 'hence', 'why', 'besides', 'will', 'neither', 'yours', 'someone', 'side', 'nobody', 'or', 'and', 'via', 'its', 'name', 'before', 'once', '’ll', 'some', "'d", 'both', 'several', 'everyone', 'less', 'formerly', 'within', 'sixty', 'by', 'anyhow', 'still', 'take', 'six', 'however', 'us', 'to', 'whither', 'three', 'go', 'always', 'eight', 'become', 'upon', 'do', 'get', 'made', 'wherever', 'wherein', 'cannot', 're', 'eleven', 'hundred', 'beside', 'may', 'same', 'latter', 'how', 'was', '‘re', "'s", 'while', "'m", 'should', 'afterwards', 'moreover', 'is', 'fifteen', 'any', 'five', 'in', 'very', 'yet', 'even', 'others', 'would', 'her', 'n’t', 'well', 'mostly', 'beforehand', 'am', "n't", 'does', 'anyway', 'what',

In [41]:
# Add a new word to the default stop-word set. Use lowercase!
nlp.Defaults.stop_words.add('btw')
# Set the stop-word flag on the lexeme of the same word.
# (The original flagged "ntw", a typo that marked an unrelated token
# instead of the word that was just added.)
nlp.vocab["btw"].is_stop = True

In [42]:
# Confirm that 'btw' is now reported as a stop word.
nlp.vocab["btw"].is_stop

True

In [43]:
# Contrast: look up the flag for a word that was not added as a stop word.
nlp.vocab["mystery"].is_stop

False

In [46]:
# Put the word into the stop-word set (use lowercase) ...
nlp.Defaults.stop_words.update({"btw"})

# ... and check the flag on its lexeme.
nlp.vocab["btw"].is_stop


True


Part of Speech (POS)


In [None]:
# Build a simple Doc, then print one row per token: text, coarse POS,
# fine-grained tag, and spaCy's explanation of that tag.
doc = nlp(u"the quick brown fox jumped over the lazy dog's back")
for tok in doc:
    print(f"{tok.text:10}{tok.pos_:8}{tok.tag_:6}{spacy.explain(tok.tag_)}")

Working with POS

In [None]:
# Inspect the second token (index 1) of this sentence: its text, coarse POS,
# fine-grained tag and the tag's explanation.
doc = nlp('I read a books on NLP')
r = doc[1]
print(f'{r.text:10}{r.pos_:8}{r.tag_:6}{spacy.explain(r.tag_)}')

In [None]:
# Same inspection on a slightly different sentence, to compare the tag
# given to the second token.
doc = nlp(u"I read books on NLP")
r = doc[1]
print(f"{r.text:10}{r.pos_:8}{r.tag_:6}{spacy.explain(r.tag_)}")

Counting POS Tags

In [None]:
# Count how often each coarse-grained POS tag occurs in the Doc.
# Later cells resolve the keys through doc.vocab, i.e. they are ids rather
# than tag names.
pos_counts = doc.count_by(spacy.attrs.POS)
pos_counts

In [None]:
# Resolve a numeric id back to its string through the vocab.
# NOTE(review): 84 is a hard-coded id, presumably one of the POS ids above; confirm.
doc.vocab[84].text

In [None]:
# One line per POS id in ascending id order: id, name padded to 5 chars, count.
for pos_id, count in sorted(pos_counts.items()):
    print(f'{pos_id}.{doc.vocab[pos_id].text:5}:{count}')

In [None]:
# Count the different fine-grained tags in the Doc.
tag_counts = doc.count_by(spacy.attrs.TAG)

# One line per tag id in ascending id order: id, tag name padded to 4 chars,
# and the number of occurrences. (The original interpolated the literal 4
# instead of the count, so every row ended in ":4".)
for tag_id, count in sorted(tag_counts.items()):
    print(f"{tag_id}.{doc.vocab[tag_id].text:{4}}:{count}")

Named Entities

In [None]:
#write a function to display basic entity info
def show_ents(doc):
    """Print one "text-label-explanation" line per entity in ``doc.ents``.

    When the document has no entities, a single fallback message is printed
    instead. Nothing is returned.
    """
    entities = doc.ents
    if not entities:
        print("No named entities found")
        return
    for entity in entities:
        # spacy.explain may return None for an unknown label, hence str().
        parts = (entity.text, entity.label_, str(spacy.explain(entity.label_)))
        print("-".join(parts))

In [None]:
# Run the entity helper on a sentence that uses "May" twice and names a place.
doc = nlp(u"May I go to washington ,DC next May to see the washington Monument")
show_ents(doc)

In [51]:
# For every entity print: text, token start/end, character start/end, label.
doc = nlp(u"can i please borrow 500 dollors from you to  buy sum Microsoft stocks")
for entity in doc.ents:
    fields = (
        entity.text,
        entity.start,
        entity.end,
        entity.start_char,
        entity.end_char,
        entity.label_,
    )
    print(*fields)

500 4 5 20 23 CARDINAL
Microsoft 12 13 53 62 ORG


In [52]:
# Adding a named entity: first list what the pipeline finds on its own.
doc = nlp(u"Tesla to build a UK factory for $ billion")
show_ents(doc)

UK-GPE-Countries, cities, states
billion-MONEY-Monetary values, including unit


In [57]:
# Question: what is missing from the entities listed for this Doc?
# This cell labels the first token by hand as an ORG entity.
from spacy.tokens import Span

# Hash value of the ORG entity label
ORG = doc.vocab.strings[u"ORG"]
# Span covering token 0 only (end index 1 is exclusive), carrying the ORG label
new_ent = Span(doc, 0, 1, label=ORG)
# Append the new span to the entities already present on the Doc
doc.ents = [*doc.ents, new_ent]

In [58]:
# The two adjacent literals are joined by Python into one input string.
doc = nlp(
    u"our company plans to introduce a new vaccum cleaner "
    u"if succesful the vaccum cleaner will be our first product"
)
show_ents(doc)

vaccum-ORG-Companies, agencies, institutions, etc.
vaccum-ORG-Companies, agencies, institutions, etc.
first-ORDINAL-"first", "second", etc.


Counting NER

In [63]:
# Sentence with two amounts; the next cell counts its MONEY entities.
doc = nlp(u"originally priced at $29.9 ,the sweater was marked down to five  dollars")
show_ents(doc)

29.9-MONEY-Monetary values, including unit
five  dollars-MONEY-Monetary values, including unit


In [64]:
# Number of entities in the Doc whose label is MONEY.
sum(1 for entity in doc.ents if entity.label_ == "MONEY")

2

Noun Chunks

In [67]:
# For each noun chunk print: chunk text, its root token, the root's
# dependency label, and the root's head token.
doc = nlp(u"Autonomous cars shift insurence liability towards manufactures")
for noun_chunk in doc.noun_chunks:
    root = noun_chunk.root
    print(f"{noun_chunk.text}-{root.text}-{root.dep_}_{root.head.text}")

Autonomous cars-cars-nsubj_shift
insurence liability-liability-dobj_shift
manufactures-manufactures-pobj_towards
