In [7]:
#reference: https://nlpforhackers.io/complete-guide-to-spacy/

# Install spaCy and download the English model (uncomment the lines below to run them)

#!pip install -U spacy

#!python -m spacy download en
#import spacy

In [1]:
import spacy

# Load the small English pipeline by its package name. The 'en' shortcut
# depends on a link created at download time and was removed in spaCy v3.
nlp = spacy.load('en_core_web_sm')
doc = nlp('Hello      World!')
# Print each token wrapped in quotes so whitespace tokens are visible.
for token in doc:
    print('"{}"'.format(token.text))

"Hello"
"     "
"World"
"!"


In [2]:
# Show every token in quotes next to its `idx` attribute.
for tok in doc:
    quoted_text = '"{}"'.format(tok.text)
    print(quoted_text, tok.idx)

"Hello" 0
"     " 6
"World" 11
"!" 16


In [4]:
doc = nlp("Next week I'll   be in Madrid.")
# One tab-separated row per token: text, idx, lemma, is_punct, is_space,
# shape, coarse POS tag and fine-grained tag.
for tok in doc:
    fields = (
        tok.text, tok.idx, tok.lemma_, tok.is_punct,
        tok.is_space, tok.shape_, tok.pos_, tok.tag_,
    )
    print("\t".join(str(field) for field in fields))

Next	0	next	False	False	Xxxx	ADJ	JJ
week	5	week	False	False	xxxx	NOUN	NN
I	10	-PRON-	False	False	X	PRON	PRP
'll	11	will	False	False	'xx	VERB	MD
  	15	  	False	True	  	SPACE	_SP
be	17	be	False	False	xx	VERB	VB
in	20	in	False	False	xx	ADP	IN
Madrid	23	madrid	False	False	Xxxxx	PROPN	NNP
.	29	.	True	False	.	PUNCT	.


# Sentence detection

In [5]:
doc = nlp("These are apples. these are oranges.")
# Iterate over the sentence spans exposed by doc.sents, one per line.
for sentence in doc.sents:
    print(sentence)

These are apples.
these are oranges.


# Part Of Speech Tagging

In [6]:
doc = nlp("Next week I'll be in Madrid.")
# Pair every token's text with its fine-grained tag.
tagged_tokens = [(tok.text, tok.tag_) for tok in doc]
print(tagged_tokens)

[('Next', 'JJ'), ('week', 'NN'), ('I', 'PRP'), ("'ll", 'MD'), ('be', 'VB'), ('in', 'IN'), ('Madrid', 'NNP'), ('.', '.')]


# Named Entity Recognition

In [8]:
doc = nlp("Next week I'll be in Madrid.")
# Print each recognised entity followed by its label.
for entity in doc.ents:
    print(entity.text, entity.label_)

Next week DATE
Madrid GPE


In [11]:
# Display the entity spans of the current `doc` as the cell's output.
doc.ents

(Next week, Madrid)

In [12]:
from nltk.chunk import conlltags2tree
 
doc = nlp("Next week I'll be in Madrid.")

# Build (text, tag, IOB label) triples. Tokens whose IOB code is 'O' keep the
# bare code; every other token gets "<IOB code>-<entity type>".
iob_tagged = []
for token in doc:
    if token.ent_iob_ == 'O':
        iob_label = token.ent_iob_
    else:
        iob_label = "{0}-{1}".format(token.ent_iob_, token.ent_type_)
    iob_tagged.append((token.text, token.tag_, iob_label))

print(iob_tagged)

# The same triples converted to the nltk.Tree format
print(conlltags2tree(iob_tagged))

[('Next', 'JJ', 'B-DATE'), ('week', 'NN', 'I-DATE'), ('I', 'PRP', 'O'), ("'ll", 'MD', 'O'), ('be', 'VB', 'O'), ('in', 'IN', 'O'), ('Madrid', 'NNP', 'B-GPE'), ('.', '.', 'O')]
(S
  (DATE Next/JJ week/NN)
  I/PRP
  'll/MD
  be/VB
  in/IN
  (GPE Madrid/NNP)
  ./.)


In [13]:
doc = nlp("I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ")
# Print each recognised entity followed by its label.
for entity in doc.ents:
    print(entity.text, entity.label_)

2 CARDINAL
9 a.m. TIME
30% PERCENT
just 2 days DATE
WSJ ORG


In [33]:
from spacy import displacy
 
sentence = 'I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ'
doc = nlp(sentence)
# Render the entity view of the document inside the notebook.
displacy.render(doc, style='ent', jupyter=True)

# Chunking
spaCy automatically detects noun-phrases as well:

In [17]:
doc = nlp("Wall Street Journal just published an interesting piece on crypto currencies")
# For each noun chunk print its text, its label and the text of its root token.
for noun_phrase in doc.noun_chunks:
    root_token = noun_phrase.root
    print(noun_phrase.text, noun_phrase.label_, root_token.text)
 

Wall Street Journal NP Journal
an interesting piece NP piece
crypto currencies NP currencies


# Dependency Parsing
This is what makes spaCy really stand out. Let’s see the dependency parser in action:

In [18]:
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')

# One line per token: token/tag, the dependency label, then head/tag.
arc_template = "{0}/{1} <--{2}-- {3}/{4}"
for token in doc:
    head = token.head
    print(arc_template.format(
        token.text, token.tag_, token.dep_, head.text, head.tag_))
 

Wall/NNP <--compound-- Street/NNP
Street/NNP <--compound-- Journal/NNP
Journal/NNP <--nsubj-- published/VBD
just/RB <--advmod-- published/VBD
published/VBD <--ROOT-- published/VBD
an/DT <--det-- piece/NN
interesting/JJ <--amod-- piece/NN
piece/NN <--dobj-- published/VBD
on/IN <--prep-- piece/NN
crypto/JJ <--compound-- currencies/NNS
currencies/NNS <--pobj-- on/IN


In [19]:
from spacy import displacy
 
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')
# Render the dependency view inside the notebook with a custom 'distance' option.
render_options = {'distance': 90}
displacy.render(doc, style='dep', jupyter=True, options=render_options)
 

# Word Vectors
spaCy comes shipped with a Word Vector model as well. We’ll need to download a larger model for that:

In [20]:
# Requires the large English model; install it first with:
#   python -m spacy download en_core_web_lg

nlp = spacy.load('en_core_web_lg')
# Look up the vocabulary entry for "banana" and print its vector.
banana_entry = nlp.vocab['banana']
print(banana_entry.vector)
 

[ 2.0228e-01 -7.6618e-02  3.7032e-01  3.2845e-02 -4.1957e-01  7.2069e-02
 -3.7476e-01  5.7460e-02 -1.2401e-02  5.2949e-01 -5.2380e-01 -1.9771e-01
 -3.4147e-01  5.3317e-01 -2.5331e-02  1.7380e-01  1.6772e-01  8.3984e-01
  5.5107e-02  1.0547e-01  3.7872e-01  2.4275e-01  1.4745e-02  5.5951e-01
  1.2521e-01 -6.7596e-01  3.5842e-01 -4.0028e-02  9.5949e-02 -5.0690e-01
 -8.5318e-02  1.7980e-01  3.3867e-01  1.3230e-01  3.1021e-01  2.1878e-01
  1.6853e-01  1.9874e-01 -5.7385e-01 -1.0649e-01  2.6669e-01  1.2838e-01
 -1.2803e-01 -1.3284e-01  1.2657e-01  8.6723e-01  9.6721e-02  4.8306e-01
  2.1271e-01 -5.4990e-02 -8.2425e-02  2.2408e-01  2.3975e-01 -6.2260e-02
  6.2194e-01 -5.9900e-01  4.3201e-01  2.8143e-01  3.3842e-02 -4.8815e-01
 -2.1359e-01  2.7401e-01  2.4095e-01  4.5950e-01 -1.8605e-01 -1.0497e+00
 -9.7305e-02 -1.8908e-01 -7.0929e-01  4.0195e-01 -1.8768e-01  5.1687e-01
  1.2520e-01  8.4150e-01  1.2097e-01  8.8239e-02 -2.9196e-02  1.2151e-03
  5.6825e-02 -2.7421e-01  2.5564e-01  6.9793e-02 -2

In [25]:
from scipy import spatial

def cosine_similarity(x, y):
    """Return 1 minus scipy's cosine distance between vectors x and y."""
    return 1 - spatial.distance.cosine(x, y)


man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector
queen = nlp.vocab['queen'].vector
king = nlp.vocab['king'].vector

# Find the vocabulary entries closest to the result of
# "man" - "woman" + "queen"
maybe_king = man - woman + queen

# Score every vocabulary entry that has a vector; entries without one are skipped.
computed_similarities = [
    (lexeme, cosine_similarity(maybe_king, lexeme.vector))
    for lexeme in nlp.vocab
    if lexeme.has_vector
]

# Highest similarity first, then show the ten best matches.
computed_similarities.sort(key=lambda pair: -pair[1])
print([pair[0].text for pair in computed_similarities[:10]])

['Queen', 'QUEEN', 'queen', 'King', 'KING', 'king', 'KIng', 'Kings', 'KINGS', 'kings']


# Computing Similarity

In [26]:
# Fetch the four vocabulary entries in one pass.
banana, dog, fruit, animal = (
    nlp.vocab[name] for name in ('banana', 'dog', 'fruit', 'animal')
)

# Compare each word against both category words.
dog_scores = (dog.similarity(animal), dog.similarity(fruit))
print(*dog_scores)
banana_scores = (banana.similarity(fruit), banana.similarity(animal))
print(*banana_scores)

0.66185343 0.2355285
0.67148364 0.24272852


In [27]:
target = nlp("Cats are beautiful animals.")

doc1 = nlp("Dogs are awesome.")
doc2 = nlp("Some gorgeous creatures are felines.")
doc3 = nlp("Dolphins are swimming animals.")

# Print the similarity of the target document to each candidate in turn.
for candidate in (doc1, doc2, doc3):
    print(target.similarity(candidate))

0.8901765218466683
0.9115828449161616
0.8588322588373283


# Extending spaCy

Creating a document-level extension

In [29]:
import spacy
from spacy.tokens import Doc
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sentance_analyzer = SentimentIntensityAnalyzer()


def polarity_scores(doc):
    """Return the analyzer's polarity scores for the full text of `doc`."""
    return sentance_analyzer.polarity_scores(doc.text)


# force=True lets this cell be re-run: registering an extension name that
# already exists raises an error otherwise.
Doc.set_extension('polarity_scores', getter=polarity_scores, force=True)

# Load by package name; the 'en' shortcut link was removed in spaCy v3.
nlp = spacy.load('en_core_web_sm')
doc = nlp("Really whaat event apple nice! it!")
print(doc._.polarity_scores)


{'neg': 0.0, 'neu': 0.596, 'pos': 0.404, 'compound': 0.5242}


In [30]:
# Load by package name (the 'en' shortcut link was removed in spaCy v3),
# then print the loaded pipeline.
nlp = spacy.load("en_core_web_sm")
print(nlp.pipeline)

[('tagger', <spacy.pipeline.Tagger object at 0x0000018867DF3C50>), ('parser', <spacy.pipeline.DependencyParser object at 0x000001883F3849E8>), ('ner', <spacy.pipeline.EntityRecognizer object at 0x000001883F360A98>)]


# Creating a custom pipeline
Let’s build a custom pipeline component that needs to be applied after the tagger has run. We need the POS tags to get the synset from WordNet.

In [47]:
from nltk.corpus import wordnet as wn
from spacy.tokens import Token
 
 
def penn_to_wn(tag):
    """Map a tag to a one-letter WordNet part-of-speech code.

    Tags starting with 'N', 'V', 'J' or 'R' map to 'n', 'v', 'a' and 'r'
    respectively; any other tag yields None.
    """
    prefix_to_pos = (
        ('N', 'n'),
        ('V', 'v'),
        ('J', 'a'),
        ('R', 'r'),
    )
    for prefix, wn_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wn_pos
    return None
 
 
class WordnetPipeline(object):
    """Pipeline component that stores a WordNet synset on each token.

    The synset is exposed through the custom ``token._.synset`` extension,
    whose default value is None.
    """

    def __init__(self, nlp):
        # `nlp` is accepted but not used by this component.
        # force=True allows the extension to be registered again on re-instantiation.
        Token.set_extension('synset', force=True, default=None)

    def __call__(self, doc):
        """Annotate the tokens of `doc` with synsets and return `doc`.

        Tokens whose tag has no WordNet equivalent, or for which WordNet
        returns no synsets, keep the default value None.
        """
        for token in doc:
            wn_tag = penn_to_wn(token.tag_)
            if wn_tag is None:
                continue

            synsets = wn.synsets(token.text, wn_tag)
            if not synsets:
                # No entry for this word/POS: indexing [0] would raise IndexError.
                continue

            # Keep the first synset returned for this word and part of speech.
            token._.set('synset', synsets[0])

        return doc
 

In [48]:
# Load by package name; the 'en' shortcut link was removed in spaCy v3.
nlp = spacy.load('en_core_web_sm')
wn_pipeline = WordnetPipeline(nlp)
# Register the WordNet component under the name 'wn_synsets'.
nlp.add_pipe(wn_pipeline, name='wn_synsets')
doc = nlp("Paris is the awesome capital of France.")

# Print each token next to the synset stored on it (None when nothing was set).
for token in doc:
    print(token.text, "-", token._.synset)

Paris - Synset('paris.n.01')
is - Synset('be.v.01')
the - None
awesome - Synset('amazing.s.02')
capital - Synset('capital.n.01')
of - None
France - Synset('france.n.01')
. - None


# Conclusions
spaCy is a modern, reliable NLP framework that quickly became the standard for doing NLP with Python. Its main advantages are: speed, accuracy, extensibility. It also comes shipped with useful assets like word embeddings. It can act as the central part of your production NLP pipeline.