In [4]:
!pip install -U spacy

Collecting spacy
  Downloading spacy-3.5.1-cp39-cp39-win_amd64.whl (12.2 MB)
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.4-py3-none-any.whl (11 kB)
Collecting srsly<3.0.0,>=2.4.3
  Downloading srsly-2.4.6-cp39-cp39-win_amd64.whl (482 kB)
Collecting smart-open<7.0.0,>=5.2.1
  Downloading smart_open-6.3.0-py3-none-any.whl (56 kB)
Collecting thinc<8.2.0,>=8.1.8
  Downloading thinc-8.1.9-cp39-cp39-win_amd64.whl (1.5 MB)
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.7-cp39-cp39-win_amd64.whl (30 kB)
Collecting pathy>=0.10.0
  Downloading pathy-0.10.1-py3-none-any.whl (48 kB)
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.8-py3-none-any.whl (17 kB)
Collecting wasabi<1.2.0,>=0.9.1
  Downloading wasabi-1.1.1-py3-none-any.whl (27 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.9-cp39-cp39-win_amd64.whl (18 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting 

In [5]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.5.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


# Linguistic Annotations

In [1]:
import spacy

### Tokenization, Lemmatization, stopwords

In [2]:
# spacy.load() loads an installed trained pipeline and returns a Language object.
nlp = spacy.load("en_core_web_sm")
text = "Apple is looking at buying U.K. startups for $1 billion."
# Calling the Language object on raw text produces a Doc that can be iterated token by token.
doc = nlp(text)
for token in doc:
    # One row per token: surface text, Python type, lemma, alphabetic flag, stop-word flag.
    row = f"token: {token.text : >8}, token_type: {type(token)}, token_lemma: {token.lemma_: >8}, token_alpha: {token.is_alpha:3}, token_is_stop: {token.is_stop}"
    print(row)

token:    Apple, token_type: <class 'spacy.tokens.token.Token'>, token_lemma:    Apple, token_alpha:   1, token_is_stop: False
token:       is, token_type: <class 'spacy.tokens.token.Token'>, token_lemma:       be, token_alpha:   1, token_is_stop: True
token:  looking, token_type: <class 'spacy.tokens.token.Token'>, token_lemma:     look, token_alpha:   1, token_is_stop: False
token:       at, token_type: <class 'spacy.tokens.token.Token'>, token_lemma:       at, token_alpha:   1, token_is_stop: True
token:   buying, token_type: <class 'spacy.tokens.token.Token'>, token_lemma:      buy, token_alpha:   1, token_is_stop: False
token:     U.K., token_type: <class 'spacy.tokens.token.Token'>, token_lemma:     U.K., token_alpha:   0, token_is_stop: False
token: startups, token_type: <class 'spacy.tokens.token.Token'>, token_lemma:  startup, token_alpha:   1, token_is_stop: False
token:      for, token_type: <class 'spacy.tokens.token.Token'>, token_lemma:      for, token_alpha:   1, token_i

When text is passed to the `nlp` object, spaCy automatically tokenizes it into words, as shown above.


### Parts-of-speech tagging (POS)
**REMARK**: To perform part-of-speech (POS) tagging, spaCy uses a trained pipeline with `tagger` and `parser` capabilities (the `parser` supplies the dependency labels shown below).

In [3]:
# Reuses the `doc` built in the tokenization cell above (same text, same pipeline).
for token in doc:
    # One row per token: text, coarse POS tag, dependency label, orthographic shape.
    row = f"token: {token.text : >8}, token_pos: {token.pos_:>6}, token_dep: {token.dep_:>8}, token_shape: {token.shape_:7}"
    print(row)

token:    Apple, token_pos:  PROPN, token_dep:    nsubj, token_shape: Xxxxx  
token:       is, token_pos:    AUX, token_dep:      aux, token_shape: xx     
token:  looking, token_pos:   VERB, token_dep:     ROOT, token_shape: xxxx   
token:       at, token_pos:    ADP, token_dep:     prep, token_shape: xx     
token:   buying, token_pos:   VERB, token_dep:    pcomp, token_shape: xxxx   
token:     U.K., token_pos:  PROPN, token_dep: compound, token_shape: X.X.   
token: startups, token_pos:   NOUN, token_dep:     dobj, token_shape: xxxx   
token:      for, token_pos:    ADP, token_dep:     prep, token_shape: xxx    
token:        $, token_pos:    SYM, token_dep: quantmod, token_shape: $      
token:        1, token_pos:    NUM, token_dep: compound, token_shape: d      
token:  billion, token_pos:    NUM, token_dep:     pobj, token_shape: xxxx   
token:        ., token_pos:  PUNCT, token_dep:    punct, token_shape: .      


After tokenization, spaCy processes the document with its built-in pipeline and assigns various linguistic annotations to each token.

### Dependency parsing

In [4]:
from spacy import displacy

In [5]:
# Draw the dependency parse of `doc` (arcs between tokens) with displaCy.
displacy.render(doc, style="dep")

### Named Entity Recognition
**Remark**: This functionality requires a trained pipeline that has capability for `ner`

In [7]:
# Named entities are stored in the `ents` attribute of the processed Doc.
for ent in doc.ents:
    # Entity text, start/end character offsets in the original string, and entity label.
    entity_fields = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*entity_fields)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 45 55 MONEY


In [8]:
# Highlight the named entities of `doc` inline in the text with displaCy.
displacy.render(doc, style="ent")

 ### Word vectors and similarity
 **Remarks**: To use this functionality, spaCy needs a trained pipeline that supports the following capability: `vector`
 
 Word vectors can be generated using algorithms like `word2vec`.

In [10]:
# Per token: text, whether a vector is available, the vector's norm, and the out-of-vocabulary flag.
# NOTE(review): every token reports is_oov=True in the recorded output, presumably because
# the small (`_sm`) pipeline ships without static word vectors; confirm against the model docs.
for token in doc:
    vector_info = (token.text, token.has_vector, token.vector_norm, token.is_oov)
    print(*vector_info)

Apple True 8.649501 True
is True 9.339922 True
looking True 7.940327 True
at True 7.695929 True
buying True 8.712863 True
U.K. True 8.170022 True
startups True 6.3557467 True
for True 8.6441765 True
$ True 11.0081215 True
1 True 12.593216 True
billion True 8.605671 True
. True 11.049248 True


In [11]:
# Show the vector of the last token in `doc` (the final "."). Indexing the Doc
# explicitly avoids depending on the `token` loop variable left over from an
# earlier cell, which breaks if cells are re-run in a different order.
doc[-1].vector

array([-1.1132902 , -0.6667944 ,  0.06082571, -1.6785738 ,  0.31087735,
        2.0408597 ,  0.47645405, -0.16482933, -0.1333046 , -0.40678078,
        1.7571216 , -0.78491527,  0.11402513,  2.5644388 , -0.30226463,
        1.0801247 , -1.3455181 , -0.2968004 ,  1.0484661 , -0.6410197 ,
        0.62300646, -0.6683191 ,  1.1679491 , -0.56523985,  1.8794154 ,
        3.114655  ,  0.56089675,  0.04845662, -0.4872135 ,  1.6438191 ,
        0.30530605,  1.487636  ,  2.0936441 , -0.3409445 , -0.54809034,
       -0.2635536 , -0.3225018 , -0.12358388, -0.8403523 ,  1.4672818 ,
        2.0713353 , -0.3664598 ,  0.21767265, -0.89840335,  0.13178311,
       -0.8630082 , -0.49971142, -0.12657723,  1.5313015 , -0.6017082 ,
        3.6392012 , -1.2922945 , -0.32340342,  0.3206504 ,  0.6304056 ,
       -1.6455086 , -0.8981062 , -1.0857716 ,  0.9761398 , -0.26554632,
        0.4065959 , -1.9962384 , -0.15032046,  1.3527875 ,  0.97427845,
        0.13042423, -0.5977704 ,  0.47596082, -0.52887297,  0.20

In [14]:
# Build two short documents to compare.
doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")

# Doc.similarity returns a single numeric score for the pair of documents.
similarity_score = doc1.similarity(doc2)
print(doc1, "<->", doc2, similarity_score)

I like salty fries and hamburgers. <-> Fast food tastes very good. 0.36760004863009244


  print(doc1, "<->", doc2, doc1.similarity(doc2))


## Pipelines

In [17]:
# Re-create `text` and `doc` with the pipeline loaded earlier (`nlp`) so the
# Pipelines section starts from a freshly processed Doc.
text = "Apple is looking at buying U.K. startups for $1 billion."
doc = nlp(text)

In [18]:
# Inspect the class of the loaded pipeline object.
type(nlp)

spacy.lang.en.English

In [19]:
# Inspect the class of the object returned by calling the pipeline on text.
type(doc)

spacy.tokens.doc.Doc