In [1]:
import spacy

In [2]:
sp = spacy.load("en")

In [3]:
sp.vocab.length

489

In [4]:
#Opening paragraph from a tale of two cities
tex = sp("It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way—in short, the period was so far like the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only.")

In [5]:
# Finding pos tags of all words
[(token.text, token.pos_) for token in tex]

[('It', 'PRON'),
 ('was', 'AUX'),
 ('the', 'DET'),
 ('best', 'ADJ'),
 ('of', 'ADP'),
 ('times', 'NOUN'),
 (',', 'PUNCT'),
 ('it', 'PRON'),
 ('was', 'AUX'),
 ('the', 'DET'),
 ('worst', 'ADJ'),
 ('of', 'ADP'),
 ('times', 'NOUN'),
 (',', 'PUNCT'),
 ('it', 'PRON'),
 ('was', 'AUX'),
 ('the', 'DET'),
 ('age', 'NOUN'),
 ('of', 'ADP'),
 ('wisdom', 'NOUN'),
 (',', 'PUNCT'),
 ('it', 'PRON'),
 ('was', 'AUX'),
 ('the', 'DET'),
 ('age', 'NOUN'),
 ('of', 'ADP'),
 ('foolishness', 'NOUN'),
 (',', 'PUNCT'),
 ('it', 'PRON'),
 ('was', 'AUX'),
 ('the', 'DET'),
 ('epoch', 'NOUN'),
 ('of', 'ADP'),
 ('belief', 'NOUN'),
 (',', 'PUNCT'),
 ('it', 'PRON'),
 ('was', 'AUX'),
 ('the', 'DET'),
 ('epoch', 'NOUN'),
 ('of', 'ADP'),
 ('incredulity', 'NOUN'),
 (',', 'PUNCT'),
 ('it', 'PRON'),
 ('was', 'AUX'),
 ('the', 'DET'),
 ('season', 'NOUN'),
 ('of', 'ADP'),
 ('Light', 'NOUN'),
 (',', 'PUNCT'),
 ('it', 'PRON'),
 ('was', 'AUX'),
 ('the', 'DET'),
 ('season', 'NOUN'),
 ('of', 'ADP'),
 ('Darkness', 'PROPN'),
 (',', 'PUNC

In [6]:
#what does each tag mean?
[(token.text, token.pos_, spacy.explain(token.pos_)) for token in tex]

[('It', 'PRON', 'pronoun'),
 ('was', 'AUX', 'auxiliary'),
 ('the', 'DET', 'determiner'),
 ('best', 'ADJ', 'adjective'),
 ('of', 'ADP', 'adposition'),
 ('times', 'NOUN', 'noun'),
 (',', 'PUNCT', 'punctuation'),
 ('it', 'PRON', 'pronoun'),
 ('was', 'AUX', 'auxiliary'),
 ('the', 'DET', 'determiner'),
 ('worst', 'ADJ', 'adjective'),
 ('of', 'ADP', 'adposition'),
 ('times', 'NOUN', 'noun'),
 (',', 'PUNCT', 'punctuation'),
 ('it', 'PRON', 'pronoun'),
 ('was', 'AUX', 'auxiliary'),
 ('the', 'DET', 'determiner'),
 ('age', 'NOUN', 'noun'),
 ('of', 'ADP', 'adposition'),
 ('wisdom', 'NOUN', 'noun'),
 (',', 'PUNCT', 'punctuation'),
 ('it', 'PRON', 'pronoun'),
 ('was', 'AUX', 'auxiliary'),
 ('the', 'DET', 'determiner'),
 ('age', 'NOUN', 'noun'),
 ('of', 'ADP', 'adposition'),
 ('foolishness', 'NOUN', 'noun'),
 (',', 'PUNCT', 'punctuation'),
 ('it', 'PRON', 'pronoun'),
 ('was', 'AUX', 'auxiliary'),
 ('the', 'DET', 'determiner'),
 ('epoch', 'NOUN', 'noun'),
 ('of', 'ADP', 'adposition'),
 ('belief', 'NO

In [7]:
#frequency of POS tags
pos_freq = tex.count_by(spacy.attrs.POS)
for k,v in sorted(pos_freq.items()):
    print(f'{tex.vocab[k].text:{6}}: {v}')

ADJ   : 9
ADP   : 20
ADV   : 5
AUX   : 16
CCONJ : 1
DET   : 17
NOUN  : 25
PRON  : 18
PROPN : 2
PUNCT : 19
SCONJ : 2
VERB  : 4


In [8]:
tex0 = sp("It's a good time to visit N.Y.C. this Sept. for 30,000 people.")

In [9]:
for token in tex0:
    print (token.text)

It
's
a
good
time
to
visit
N.Y.C.
this
Sept.
for
30,000
people
.


In [4]:
# Spacy visualiser
from spacy import displacy

In [11]:
tex1=sp("The dog walked up the hill.")

In [12]:
[(token.text, token.dep_) for token in tex1]

[('The', 'det'),
 ('dog', 'nsubj'),
 ('walked', 'ROOT'),
 ('up', 'prep'),
 ('the', 'det'),
 ('hill', 'pobj'),
 ('.', 'punct')]

In [13]:
displacy.render(tex1, style="dep", jupyter = True)

In [14]:
options = {"compact": True, "color": "blue"}
displacy.render(tex1, style="dep", options=options)

In [15]:
tex2 = sp("John has played basketball since 2010. He follows NBA")
displacy.render(tex2, style="ent")

In [16]:
tex3 = sp("Stacy has played basketball since 2010. Her favourite NBA team is the Lakers")
displacy.render(tex3, style="ent")
for entity in tex3.ents:
    print(entity.text + ' - ' + entity.label_ + ' - ' + str(spacy.explain(entity.label_)))

2010 - DATE - Absolute or relative dates or periods
NBA - ORG - Companies, agencies, institutions, etc.


In [17]:
# recognizing eliza as a person and lakers as an org
from spacy.tokens import Span

PERSON = tex3.vocab.strings[u'PERSON']
ORG = tex3.vocab.strings[u'ORG']
new_entity = Span(tex3, 0, 1, label=PERSON)
new_entity2 = Span(tex3, 13, 14, label=ORG)
tex3.ents = list(tex3.ents) + [new_entity] + [new_entity2]

In [18]:
displacy.render(tex3, style="ent")
for entity in tex3.ents:
    print(entity.text + ' - ' + entity.label_ + ' - ' + str(spacy.explain(entity.label_)))

Stacy - PERSON - People, including fictional
2010 - DATE - Absolute or relative dates or periods
NBA - ORG - Companies, agencies, institutions, etc.
Lakers - ORG - Companies, agencies, institutions, etc.


In [19]:
from spacy.lang.en import English

nlp = English()  
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)
for sent in tex.sents:
    print(sent.text)

It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way—in short, the period was so far like the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only.


In [20]:
# One very long sentence. As an example, I will make ',' a sentence boundary. Can be used for ellipses and emojis which are common sentence boundaries on social media

def set_custom_boundaries(tex):
    for token in tex[:-1]:
        if token.text == ",":
            tex[token.i+1].is_sent_start = True
    return tex

In [21]:
sp.add_pipe(set_custom_boundaries, before="parser")

In [22]:
tex = sp("It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way—in short, the period was so far like the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only.")
for sent in tex.sents:
    print(sent.text)

It was the best of times,
it was the worst of times,
it was the age of wisdom,
it was the age of foolishness,
it was the epoch of belief,
it was the epoch of incredulity,
it was the season of Light,
it was the season of Darkness,
it was the spring of hope,
it was the winter of despair,
we had everything before us,
we had nothing before us,
we were all going direct to Heaven,
we were all going direct the other way—in short,
the period was so far like the present period,
that some of its noisiest authorities insisted on its being received,
for good or for evil,
in the superlative degree of comparison only.


In [2]:
nlp = spacy.load('en_core_web_lg')

In [24]:
nlp.pipe_names
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x2c24ae52ec8>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x2c247f29dc8>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x2c247f29b88>)]

In [25]:
tokens = nlp("apple dog broken banana cat onomatopoeia asdfkj")
# asdfkj is a variant of "keyboard smashing" - a phenomenon which is seen on most social media
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

apple True 7.1346846 False
dog True 7.0336733 False
broken True 5.5968375 False
banana True 6.700014 False
cat True 6.6808186 False
onomatopoeia True 6.8262777 False
asdfkj False 0.0 True


In [26]:
# comparing similarities
for tok1 in tokens:
    for tok2 in tokens:
        print(tok1.text, tok2.text, tok1.similarity(tok2))

apple apple 1.0
apple dog 0.2633902
apple broken 0.30717564
apple banana 0.5831844
apple cat 0.28213844
apple onomatopoeia 0.041739788
apple asdfkj 0.0
dog apple 0.2633902
dog dog 1.0
dog broken 0.2948628
dog banana 0.24327648
dog cat 0.80168545
dog onomatopoeia 0.02036748
dog asdfkj 0.0
broken apple 0.30717564
broken dog 0.2948628
broken broken 1.0
broken banana 0.25774238
broken cat 0.30218005
broken onomatopoeia -0.06426965
broken asdfkj 0.0
banana apple 0.5831844
banana dog 0.24327648
banana broken 0.25774238
banana banana 1.0
banana cat 0.28154367
banana onomatopoeia 0.046276174
banana asdfkj 0.0
cat apple 0.28213844
cat dog 0.80168545
cat broken 0.30218005
cat banana 0.28154367
cat cat 1.0
cat onomatopoeia 0.040334832
cat asdfkj 0.0
onomatopoeia apple 0.041739788
onomatopoeia dog 0.02036748
onomatopoeia broken -0.06426965
onomatopoeia banana 0.046276174
onomatopoeia cat 0.040334832
onomatopoeia onomatopoeia 1.0
onomatopoeia asdfkj 0.0
asdfkj apple 0.0
asdfkj dog 0.0
asdfkj broken

  after removing the cwd from sys.path.


In [27]:
tex4 = nlp("I like cats")
tex5 = nlp("I like dogs")

In [28]:
# Compare 2 tokens
tex4[2].similarity(tex5[2])

0.83117634

In [29]:
# SpaCy recognition might be limited

test1 = nlp("Aradhita wants a cat. Her cat wants a dog. But horses are the best.")
for ent in test1.ents:
    print(ent.text, ent.label_)

Aradhita ORG


In [30]:
# I want SpaCy to identify products, and to identify my name as a person

In [31]:
ner=nlp.get_pipe("ner")
#ner is pipeline component

In [32]:
#random training data, I am also trying to train a new entity called animal
TRAINING_DATA=[
    ("I left Vellore yesterday.", {"entities": [(7, 14, "GPE")]}),
    ("I need to buy more clothes.", {"entities": [(19, 26, "PRODUCT")]}),
    ("I rented a house.", {"entities": [(11, 16, "PRODUCT")]}),
    ("Fridge needs to be replaced ASAP ", {"entities": [(0,6, "PRODUCT")]}),
    ("Fridge can be ordered in Amazon ", {"entities": [(0,6, "PRODUCT")]}),
    ("There has been severe flooding in North India", {"entities": [(34, 45, "GPE")]}),
    ("I got my truck stolen", {"entities": [(9,14, "PRODUCT")]}),
    ("Aprajita orders clothes from amazon", {"entities": [(0,8, "PERSON")]}),
    ("I recently ordered from Shoppers Stop", {"entities": [(24,37,"ORG")]}),
    ("I bought a new bicycle", {"entities": [(15,22, "PRODUCT")]}),
    ("I donated my old toys", {"entities": [(17,21, "PRODUCT")]}),
    ("I bought a fancy new watch", {"entities": [(21,26, "PRODUCT")]}),
    ("I rented a cabin for our vacation", {"entities": [(11,16, "PRODUCT")]}),
    ("I borrowed a ball from our neighbour", {"entities": [(13,17, "PRODUCT")]}),
    ("I repaired my car", {"entities": [(14,17, "PRODUCT")]}),
    ("I got my computer fixed", {"entities": [(9,17, "PRODUCT")]}),
    ("Richa is starting school today", {"entities":[(0,5,"PERSON")]}),
    ("They adopted a boy named Amar", {"entities":[(25, 29,"PERSON")]}),
    ("Sanjay Dutt released a new film", {"entities":[(0,11,"PERSON")]}),
    ("Horses are too tall and they will hurt your feelings", {"entities":[(0, 6, "ANIMAL")]}),
    ("I want a dog", {"entities":[(9,12,"ANIMAL")]}),
    ("Cats are known to be evil", {"entities":[(0,4,"ANIMAL")]}),
    ("I saw a bird today", {"entities":[(8, 12, "ANIMAL")]}),
    ("Snoopy is a dog", {"entities":[(12,15,"ANIMAL")]}),
    ("Rio is a movie about birds", {"entities":[(21, 26,"ANIMAL")]}),
    ("Dogs should not eat chocolate", {"entities":[(0,4,"ANIMALS")]}),
    ("Cows give milk", {"entities":[(0,4,"ANIMALS")]}),
    ("Goats eat grass", {"entities":[(0, 5, "ANIMALS")]}),
    ("I do not buy Donkeys on Amazon", {"entities":[(13, 20, "ANIMALS")]}),
    ("Tigers are predators", {"entities":[(0, 6, "ANIMALS")]}),
    ("Rachna uses Amazon regularly", {"entities":[(0,6,"PERSON")]}),
    ("Archana uses Flipkart to order clothes", {"entities":[(0, 7, "PERSON")]}),
    ("Aparna has a purse", {"entities":[(0, 6, "PERSON")]})
   ]

In [33]:
# Adding labels to the `ner`

for _, annotations in TRAINING_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

In [34]:
# Disable pipeline components that should not be changed
pipe_exc = ["ner", "trf_wordpiecer", "trf_tok2vec"]
disabled_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exc]

In [35]:
# We need random to randomize ip, minibatch to create minibatches of text data, compouding func to yield an infinite series of compouding values
import random
from spacy.util import minibatch, compounding
from pathlib import Path

In [36]:
# TRAINING THE MODEL
with nlp.disable_pipes(*disabled_pipes):

  # Training for 180 iterations
  for iteration in range(180):

    # shuffling examples  before every iteration
    random.shuffle(TRAINING_DATA)
    losses = {}
    # batch up the examples using spaCy's minibatch
    batches = minibatch(TRAINING_DATA, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
        print("Losses", losses)

Losses {'ner': 13.296498000621796}
Losses {'ner': 36.52636879682541}
Losses {'ner': 51.27628618478775}
Losses {'ner': 78.09206920862198}
Losses {'ner': 100.09135013818741}
Losses {'ner': 124.22386175394058}
Losses {'ner': 144.29460698366165}
Losses {'ner': 161.07301169633865}
Losses {'ner': 168.07350224260935}
Losses {'ner': 18.900981903076172}
Losses {'ner': 34.708202838897705}
Losses {'ner': 47.86578910052776}
Losses {'ner': 75.4342643469572}
Losses {'ner': 106.97110812366009}
Losses {'ner': 133.28087060153484}
Losses {'ner': 167.75682799518108}
Losses {'ner': 183.59374026954174}
Losses {'ner': 185.41481411298037}
Losses {'ner': 22.266671419143677}
Losses {'ner': 38.98586678504944}
Losses {'ner': 56.914355665373705}
Losses {'ner': 81.49305668466081}
Losses {'ner': 110.428072124648}
Losses {'ner': 130.881619125533}
Losses {'ner': 153.8698855935811}
Losses {'ner': 173.45071807496538}
Losses {'ner': 176.72583468176458}
Losses {'ner': 28.51737880706787}
Losses {'ner': 49.70157480239868}


In [37]:
print(test1)
for ent in test1.ents:
    print(ent.text, ent.label_)

Aradhita wants a cat. Her cat wants a dog. But horses are the best.
Aradhita ORG


In [38]:
test2 = nlp("Rabbits are nice.")
test3 = nlp("Shriya is nice")
test4 = nlp("Roland owns a watch.")
test5 = nlp("I saw a snake today.")

In [39]:
print(test2)
displacy.render(test2, style="ent")

Rabbits are nice.


In [40]:
print(test3)
displacy.render(test3, style="ent")

Shriya is nice


In [41]:
print(test4)
displacy.render(test4, style="ent")

Roland owns a watch.


In [42]:
print(test5)
displacy.render(test5, style="ent")

I saw a snake today.


