**Linguistic annotations**

In [5]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

In [6]:
for token in doc:
    print(token.pos_,token.dep_)

PROPN nsubj
AUX aux
VERB ROOT
ADP prep
VERB pcomp
PROPN compound
NOUN dobj
ADP prep
SYM quantmod
NUM compound
NUM pobj


**Tokenization**

In [8]:
for token in doc:
    print(token.text)

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


**Part-of-speech tags and dependencies**

In [9]:
for token in doc:
    print(token.text,token.lemma_,token.pos_,token.tag_,token.dep_,token.shape_,token.is_alpha,token.is_stop)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


**Named Entities**

In [10]:
for ent in doc.ents:
    print(ent.text,ent.start_char,ent.end_char,ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


**Word vectors and similarity**

In [12]:
tokens = nlp("dog cat banana afskfsd")
for token in tokens:
    print(token.text,token.has_vector,token.vector_norm,token.is_oov)

dog True 17.564213 True
cat True 18.028578 True
banana True 15.8090315 True
afskfsd True 18.056723 True


In [13]:
tokens = nlp("cat dog banana")
for token1 in tokens:
    for token2 in tokens:
        print(token1.text,tokens.text,token1.similarity(token2))

cat cat dog banana 1.0
cat cat dog banana 0.51242375
cat cat dog banana 0.45876315
dog cat dog banana 0.51242375
dog cat dog banana 1.0
dog cat dog banana 0.41062877
banana cat dog banana 0.45876315
banana cat dog banana 0.41062877
banana cat dog banana 1.0


  after removing the cwd from sys.path.


**Vocab, hashes and lexemes**

In [15]:
doc = nlp("I Love coffee")
print(doc.vocab.strings["coffee"])
print(doc.vocab.strings[3197928453018144401])


3197928453018144401
coffee


In [16]:
for word in doc:
    lexeme = doc.vocab[word.text]
    print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
            lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_)

I 4690420944186131903 X I I True False True en
Love 13599639812707930908 Xxxx L ove True False True en
coffee 3197928453018144401 xxxx c fee True False False en


In [23]:
import spacy
from spacy.tokens import Doc
from spacy.vocab import Vocab

nlp = spacy.load("en_core_web_sm")
doc = nlp("I love coffee")  # Original Doc
print(doc.vocab.strings["coffee"])  # 3197928453018144401
print(doc.vocab.strings[3197928453018144401])  # 'coffee' 👍

empty_doc = Doc(Vocab())  # New Doc with empty Vocab
# empty_doc.vocab.strings[3197928453018144401] will raise an error :(

empty_doc.vocab.strings.add("coffee")  # Add "coffee" and generate hash
print(empty_doc.vocab.strings[3197928453018144401])  # 'coffee' 

new_doc = Doc(doc.vocab)  # Create new doc with first doc's vocab
print(new_doc.vocab.strings[3197928453018144401])  # 'coffee' 

3197928453018144401
coffee
coffee
coffee
