In [None]:
import spacy

In [None]:
# Load the en_core_web_sm pipeline and run it over a sample sentence.
sample_text = "Apple is looking at buying U.K. startup for $1 billion"
nlp = spacy.load('en_core_web_sm')
doc = nlp(sample_text)
# Bare last expression so the notebook displays the class of the parsed object.
type(doc)

spacy.tokens.doc.Doc

In [None]:
# One row per token: text, part-of-speech tag, dependency label, is-alphabetic flag.
#   pos_: the simple part-of-speech tag (verb, noun, ...)
#   dep_: syntactic dependency, i.e. the relation between tokens
for token in doc:
    row = (token.text, token.pos_, token.dep_, token.is_alpha)
    print(*row)

Apple PROPN nsubj True
is AUX aux True
looking VERB ROOT True
at ADP prep True
buying VERB pcomp True
U.K. PROPN compound False
startup NOUN dobj True
for ADP prep True
$ SYM quantmod False
1 NUM compound False
billion NUM pobj True


In [None]:
# Sentiment score stored on the Doc; the value displayed for this doc is 0.0.
# NOTE(review): presumably 0.0 because nothing in this pipeline sets a sentiment score - confirm.
doc.sentiment

0.0

In [None]:
# Rebind `doc` to a newly parsed sentence; the token loop in the next cell iterates over this doc.
doc = nlp("Here, we adopt the properties of possibilities and Bayes’ Theorem to do the conversion.")

In [None]:
# One row per token: text, lemma, tag, dependency label, word shape, stop-word flag.
for token in doc:
    columns = [token.text, token.lemma_, token.tag_, token.dep_, token.shape_, token.is_stop]
    print(*columns)

Here here RB advmod Xxxx True
, , , punct , False
we -PRON- PRP nsubj xx True
adopt adopt VBP ROOT xxxx False
the the DT det xxx True
properties property NNS dobj xxxx False
of of IN prep xx True
possibilities possibility NNS pobj xxxx False
and and CC cc xxx True
Bayes Bayes NNP conj Xxxxx False
’ ' '' punct ’ False
Theorem theorem NN ROOT Xxxxx False
to to TO aux xx True
do do VB xcomp xx True
the the DT det xxx True
conversion conversion NN dobj xxxx False
. . . punct . False


In [None]:
from spacy import displacy

In [None]:
# Render the dependency parse inline in the notebook.
# displacy.serve() instead starts a web server on port 5000 and blocks the
# kernel until it is shut down, so no later cell can run in the meantime.
# 'dep' is the visualizer that serve() used by default here.
displacy.render(doc, style='dep', jupyter=True)


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [None]:
# Same dependency visualisation with the style named explicitly.
# Rendered inline with displacy.render() because displacy.serve() starts a
# blocking web server on port 5000 that must be shut down before the notebook
# can continue.
displacy.render(doc, style='dep', jupyter=True)


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [None]:
# Named entity recognition

In [None]:
# Parse a sentence and list every recognised entity with its character span
# (start and end offsets) and its label.
doc2 = nlp("India is the biggest democracy in the world and USA is the second.")

for ent in doc2.ents:
    span_info = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*span_info)

India 0 5 GPE
USA 48 51 GPE
second 59 65 ORDINAL


In [None]:
# Highlight the named entities of doc2 inline in the notebook.
# displacy.render() is used because displacy.serve() starts a blocking web
# server on port 5000 that must be shut down before later cells can run.
displacy.render(doc2, style="ent", jupyter=True)


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [None]:
# Word vectors and similarity

In [None]:
import spacy.cli

# Download the en_core_web_md model only when it is not already installed, so
# re-running the notebook does not repeat the download on every execution.
if not spacy.util.is_package("en_core_web_md"):
    spacy.cli.download("en_core_web_md")

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [None]:
# Import the downloaded model package directly and load the pipeline from it.
# This rebinds `nlp`, which until now held the en_core_web_sm pipeline.
import en_core_web_md
nlp = en_core_web_md.load()

In [None]:
# Loading the same model by name. The result is only displayed, not assigned,
# so `nlp` from the previous cell is left unchanged.
spacy.load("en_core_web_md")

<spacy.lang.en.English at 0x7fc5e1e23890>

In [None]:
# For each token report:
#   has_vector  - does the token have a vector representation
#   vector_norm - norm of the token's vector
#   is_oov      - is the token out of vocabulary
tokens = nlp("lion bear apple banana qwe123rty")

for token in tokens:
    vector_info = (token.text, token.has_vector, token.vector_norm, token.is_oov)
    print(*vector_info)

lion True 6.5120897 False
bear True 5.881604 False
apple True 7.1346846 False
banana True 6.700014 False
qwe123rty False 0.0 True


In [None]:
# Print the similarity score for every ordered pair of tokens, including each
# token paired with itself. Pairs are produced in the same order as a nested
# loop: the first token varies slowest.
tokens = nlp("lion bear river apple banana cow")

for token1, token2 in ((left, right) for left in tokens for right in tokens):
    print(token1.text, token2.text, token1.similarity(token2))

lion lion 1.0
lion bear 0.6390859
lion river 0.32039484
lion apple 0.33227408
lion banana 0.21994989
lion cow 0.4780627
bear lion 0.6390859
bear bear 1.0
bear river 0.32381308
bear apple 0.3176035
bear banana 0.2628087
bear cow 0.43222302
river lion 0.32039484
river bear 0.32381308
river river 1.0
river apple 0.2081475
river banana 0.18435778
river cow 0.27732792
apple lion 0.33227408
apple bear 0.3176035
apple river 0.2081475
apple apple 1.0
apple banana 0.5831845
apple cow 0.36605674
banana lion 0.21994989
banana bear 0.2628087
banana river 0.18435778
banana apple 0.5831845
banana banana 1.0
banana cow 0.38532063
cow lion 0.4780627
cow bear 0.43222302
cow river 0.27732792
cow apple 0.36605674
cow banana 0.38532063
cow cow 1.0


In [None]:
# Vocab, hashes and lexemes
# Whenever possible, spaCy tries to store data in a vocabulary, the Vocab, that will be shared by multiple 
# documents. To save memory, spaCy also encodes all strings to hash values – 
# in this case for example, “coffee” has the hash 3197928453018144401. 
# Entity labels like “ORG” and part-of-speech tags like “VERB” are also encoded. 
# Internally, spaCy only “speaks” in hash values.

In [None]:
# Look a string up in the vocab's string table to get its hash, then use the
# hash to get the original string back.
nlp = spacy.load("en_core_web_sm")
doc = nlp("I love coffee")
string_store = doc.vocab.strings
print(string_store["coffee"])  # 3197928453018144401
print(string_store[3197928453018144401])  # 'coffee'

3197928453018144401
coffee


In [None]:
# For every token, fetch the matching lexeme from the vocab and print its
# text, hash, shape, prefix, suffix, is_alpha / is_digit / is_title flags and language.
nlp = spacy.load("en_core_web_sm")
doc = nlp("I love tea, over coffee!")
for word in doc:
    lexeme = doc.vocab[word.text]
    lexical_attrs = (
        lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
        lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_,
    )
    print(*lexical_attrs)

I 4690420944186131903 X I I True False True en
love 3702023516439754181 xxxx l ove True False False en
tea 6041671307218480733 xxx t tea True False False en
, 2593208677638477497 , , , False False False en
over 5456543204961066030 xxxx o ver True False False en
coffee 3197928453018144401 xxxx c fee True False False en
! 17494803046312582752 ! ! ! False False False en


In [None]:
from spacy.tokens import Doc
from spacy.vocab import Vocab
# Parse a sentence with the en_core_web_sm pipeline and print the hash that the
# doc's vocab holds for the string "tea".
nlp = spacy.load('en_core_web_sm')
doc = nlp("I love tea, over coffee!")
tea_hash = doc.vocab.strings["tea"]
print(tea_hash)

6041671307218480733


In [None]:
# Reverse lookup: the hash printed in the previous cell maps back to the string "tea".
print(doc.vocab.strings[6041671307218480733])

tea


In [None]:
# Build a Doc on a brand-new Vocab, then add "tea" to that vocab's string table.
# The add() call is the last expression, so its return value (the hash) is displayed.
fresh_vocab = Vocab()
empty_doc = Doc(fresh_vocab)
empty_doc.vocab.strings.add("tea")

6041671307218480733

In [None]:
# A new Doc created on the existing doc's vocab can resolve the hash back to "tea".
shared_vocab = doc.vocab
new_doc = Doc(shared_vocab)
print(new_doc.vocab.strings[6041671307218480733])

tea


In [None]:
# KnowledgeBase
# To support the entity linking task, spaCy stores external knowledge in a KnowledgeBase. 
# The knowledge base (KB) uses the Vocab to store its data efficiently.
# A knowledge base is created by first adding all entities to it. 
# Next, for each potential mention or alias, a list of relevant KB IDs and their prior probabilities is added. 
# The sum of these prior probabilities should never exceed 1 for any given alias.

In [None]:
from spacy.kb import KnowledgeBase

# Load the pipeline and create an empty knowledge base on top of its vocab.
nlp = spacy.load('en_core_web_sm')
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)

# Entities as (KB id, frequency, entity vector). Each vector has 3 components,
# matching entity_vector_length above.
entity_rows = [
    ("Q1004791", 6, [0, 3, 5]),
    ("Q42", 342, [1, 9, -3]),
    ("Q5301561", 12, [-2, 4, 2]),
]
for kb_id, frequency, vector in entity_rows:
    kb.add_entity(entity=kb_id, freq=frequency, entity_vector=vector)

# Aliases as (mention text, candidate KB ids, prior probability per candidate).
alias_rows = [
    ("Douglas", ["Q1004791", "Q42", "Q5301561"], [0.6, 0.1, 0.2]),
    ("Douglas Adams", ["Q42"], [0.9]),
]
for mention, candidates, priors in alias_rows:
    kb.add_alias(alias=mention, entities=candidates, probabilities=priors)

# Blank line first, then the KB sizes.
print()
print("Number of entities in KB:",kb.get_size_entities()) # 3
print("Number of aliases in KB:", kb.get_size_aliases()) # 2


Number of entities in KB: 3
Number of aliases in KB: 2
