In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp(u'it\'s been a year since i have seen my college. i am missing my college life very badly')

In [4]:
for token in doc:
  print(token)

it
's
been
a
year
since
i
have
seen
my
college
.
i
am
missing
my
college
life
very
badly


In [65]:
print(doc[4].tag) #THIS WILL GIVE THE HASH VALUE OF THE TOKEN

15794550382381185553


In [66]:
print(doc[4].tag_)

NNP


In [67]:
print(doc[4].pos_)

PROPN


In [8]:
for token in doc:
  print(f"{token} \t {token.pos_} \t {token.tag_} \t {spacy.explain(token.tag_)}")

it 	 PRON 	 PRP 	 pronoun, personal
's 	 AUX 	 VBZ 	 verb, 3rd person singular present
been 	 AUX 	 VBN 	 verb, past participle
a 	 DET 	 DT 	 determiner
year 	 NOUN 	 NN 	 noun, singular or mass
since 	 SCONJ 	 IN 	 conjunction, subordinating or preposition
i 	 PRON 	 PRP 	 pronoun, personal
have 	 AUX 	 VBP 	 verb, non-3rd person singular present
seen 	 VERB 	 VBN 	 verb, past participle
my 	 DET 	 PRP$ 	 pronoun, possessive
college 	 NOUN 	 NN 	 noun, singular or mass
. 	 PUNCT 	 . 	 punctuation mark, sentence closer
i 	 PRON 	 PRP 	 pronoun, personal
am 	 AUX 	 VBP 	 verb, non-3rd person singular present
missing 	 VERB 	 VBG 	 verb, gerund or present participle
my 	 DET 	 PRP$ 	 pronoun, possessive
college 	 NOUN 	 NN 	 noun, singular or mass
life 	 NOUN 	 NN 	 noun, singular or mass
very 	 ADV 	 RB 	 adverb
badly 	 ADV 	 RB 	 adverb


NOW WE WILL SEE HOW SPACY RECOGNISES TWO FORMS OF THE SAME WORD. 'READING' WILL BE RECOGNISED AS A PRESENT TENSE VERB WHEREAS READ WILL BE RECOGNISED AS PAST TENSE VERB

In [9]:
doc2 = nlp(u"I am reading a book on NLP")
for token in doc2:
  print(token)

I
am
reading
a
book
on
NLP


In [10]:
word = doc2[2]

In [11]:
token = word
print(f"{token} \t {token.pos_} \t {token.tag_} \t {spacy.explain(token.tag_)}") #SPACY.EXPLAIN IS USED TO EXPLAIN THE TAG 

reading 	 VERB 	 VBG 	 verb, gerund or present participle


In [12]:
doc3 = nlp(u"I read a book on NLP")
for token in doc3:
  print(token)


I
read
a
book
on
NLP


In [13]:
word1 = doc3[1]

In [14]:
token1 = word1
print(f"{token1} \t {token1.pos_} \t {token1.tag_} \t {spacy.explain(token1.tag_)}")

read 	 VERB 	 VBD 	 verb, past tense


In [15]:
doc4 = nlp(u"it\'s been a year since i have seen my college. i am missing my college life very badly")

In [16]:
POS_counts = doc.count_by(spacy.attrs.POS)

In [17]:
POS_counts

{86: 2, 87: 4, 90: 3, 92: 4, 95: 3, 97: 1, 98: 1, 100: 2}

In [18]:
doc.vocab[86].text

'ADV'

In [19]:
doc4[2].pos_

'AUX'

In [20]:
for k,v in sorted(POS_counts.items()):    #TO COUNT THE NO OF POS APPEARING IN THE TEXT 
  print(f"{k}. {doc.vocab[k].text} {v}")

86. ADV 2
87. AUX 4
90. DET 3
92. NOUN 4
95. PRON 3
97. PUNCT 1
98. SCONJ 1
100. VERB 2


In [21]:
DEP_counts = doc.count_by(spacy.attrs.DEP)

for k,v in sorted(DEP_counts.items()):
  print(f"{k}. {doc.vocab[k].text} {v}")

399. advcl 1
400. advmod 2
404. attr 1
405. aux 2
406. auxpass 1
415. det 1
416. dobj 2
423. mark 1
429. nsubj 2
430. nsubjpass 1
440. poss 2
445. punct 1
7037928807040764755. compound 1
8206900633647566924. ROOT 2


In [22]:
from spacy import displacy #DISPLACY IS A VIZUALISING TOOL FROM SPACY

In [23]:
doc5 = nlp(u"I scream, you scream, we all scream for ice cream")

In [24]:
spacy.displacy.serve(doc5,style='dep',page=True,port=5000) 


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


#NAMED ENTITY RECOGNITION
TO IDENTIFY THE NAMED ENTITIES PRESENT IN THE DOCUMENT.
THESE CAN BE NAME OF A PERSON, ORGANIZATION, CARS, PRODUCTS AND MANY MORE...

In [25]:
def show_ents(doc):
  if doc.ents:
    for ent in doc.ents:
      print(ent.text + ' - ' +ent.label_ + ' - ' + str(spacy.explain(ent.label_)))
  else:
    print('No entities found')    


In [26]:
doc = nlp(u'Hi How are you?')

In [27]:
show_ents(doc)  #NO NAMED ENTITY

No entities found


In [28]:
doc = nlp(u'I am going to Ranchi next month as Birla Institute of Technology, Mesra is going to reopen')

In [68]:
show_ents(doc) #AS WE CAN SEE HERE ALL THE NAMED ENTITIES ARE CORRECTLY CLASSIFIED

Tesla - PRODUCT - Objects, vehicles, foods, etc. (not services)
mercedes - ORG - Companies, agencies, institutions, etc.
Audi Q4 - PERSON - People, including fictional


In [30]:
doc = nlp(u'I will need to have Rupees 5000 to register in the flagship event') #'MONEY' ENTITY

In [31]:
show_ents(doc)

5000 - CARDINAL - Numerals that do not fall under another type


In [32]:
doc = nlp(u'I like Tesla and mercedes maybach but Audi Q4 is good as well!!')

In [33]:
show_ents(doc)

Tesla - PRODUCT - Objects, vehicles, foods, etc. (not services)
mercedes - ORG - Companies, agencies, institutions, etc.
Audi Q4 - PERSON - People, including fictional


In [34]:
from spacy.tokens import Span #SPAN IS SLICE OF THE DOCUMENT. FROM SPAN WE CAN ANALYSE A WORD OR A SEQUENCE OF WORDS FROM THE DOCUMENT

SUPPOSE WE WANT TO ADD THE NAMED ENTITY IN THE VOCAB HOW CAN WE DO THIS?
HERE 'TESLA' IS NOT INCLUDED AS THE ORGANIZATION NAME IN THE VOCAB AND WE ARE GOING TO ADD IT AS AN ENTITY UNDER 'ORGANIZATION'

In [35]:
doc6 = nlp(u'Tesla to build a factory of worth $6 million')

In [36]:
ORG = doc6.vocab.strings[u"ORG"]

In [37]:
ORG

383

In [38]:
new_entity = Span(doc6,0,1,label=ORG) #HERE WE CREATED A NEW ENTITYAND LABELLED IT AS ORG

In [39]:
doc6.ents = list(doc6.ents) + [new_entity]

In [69]:
show_ents(doc6) #SO NOW HERE WE ARE ABLE TO SEE 'TESLA' AS THE ORG NAME

Tesla - ORG - Companies, agencies, institutions, etc.
$6 million - MONEY - Monetary values, including unit


In [41]:
doc7 = nlp(u"Our company has launched a new vaccum cleaner in the market today."
          u"The launched vaccum-cleaner is the best product available")

In [42]:
show_ents(doc7)

today - DATE - Absolute or relative dates or periods


In [43]:
from spacy.matcher import PhraseMatcher  #PHRASE MATCHER IS USED WHEN WE WANT TO MATCH OR COMPARE MULTIPLE WORDS

In [44]:
matcher = PhraseMatcher(nlp.vocab)

In [45]:
phrase_list = ['vaccum cleaner', 'vaccum-cleaner']

In [46]:
phrase_pattern = [nlp(text) for text in phrase_list]

In [47]:
matcher.add('newproduct', None, *phrase_pattern)

In [48]:
found_matches =  matcher(doc7)

In [49]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 15, 18)]

In [50]:
from spacy.tokens import Span

In [51]:
PROD = doc7.vocab.strings[u"PRODUCT"]

In [52]:
new_ents=[Span(doc7,match[1],match[2],label=PROD) for match in found_matches]

In [53]:
doc7.ents = list(doc7.ents) + new_ents

In [54]:
show_ents(doc7)

vaccum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
today - DATE - Absolute or relative dates or periods
vaccum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)


In [62]:
doc8 = nlp(u"Getting admission in a private medical college will cost you will need  $25 lakh Rupees and same in the government college will cost you around 5 lakh dollars")

In [64]:
len([ent for ent in doc8.ents if ent.label_ == 'MONEY']) 

2