Ref: https://spacy.io/usage/spacy-101

In [52]:
import random
import spacy
from spacy import displacy

# Load Model and Pipeline

In [None]:
# Load the 'en_core_web_sm' model and bind the resulting pipeline object to `nlp`.
nlp = spacy.load('en_core_web_sm')

In [46]:
# Show the (name, component) pairs that make up the loaded pipeline.
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x11227b240>),
 ('parser', <spacy.pipeline.DependencyParser at 0x1122872b0>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x112287308>)]

A **custom pipeline component** is a callable that takes in a `Doc` and returns a `Doc`.

In [47]:
def my_component(doc):
    """Print how many tokens ``doc`` holds and hand it back unchanged.

    Args:
        doc: The object passed along the pipeline; only ``len(doc)`` is read.

    Returns:
        The same ``doc`` object that was passed in.
    """
    n_tokens = len(doc)
    print("After tokenization, this doc has %s tokens." % n_tokens)
    return doc
# Register the component at the front of the pipeline under the name 'print_info'.
# Guarded so that re-running this cell is a no-op: spaCy rejects adding a second
# component under a name that is already in the pipeline.
if 'print_info' not in nlp.pipe_names:
    nlp.add_pipe(my_component, name='print_info', first=True)

# Tokens

In [48]:
# Run the example sentence through the pipeline, then print one row of
# annotations per token: text, lemma, coarse POS, fine-grained tag, dependency
# label, shape, and the is_alpha / is_stop flags.
tokens = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
for token in tokens:
    row = (
        token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
        token.shape_, token.is_alpha, token.is_stop,
    )
    print(*row)

After tokenization, this doc has 11 tokens.
Apple apple PROPN NNP nsubj Xxxxx True False
is be VERB VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. u.k. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [28]:
# Despite its name, `tokens` is the object returned by calling `nlp(...)`; check its type.
type(tokens)

spacy.tokens.doc.Doc

In [22]:
# Look up a human-readable description of the "VBG" tag seen in the token table.
spacy.explain("VBG")

'verb, gerund or present participle'

http://localhost:5000/

**dependency parsing**

In [30]:
# Draw the dependency parse of the example sentence inline in the notebook.
# Uses `tokens` (the parsed sentence): `doc` is not assigned until a later cell,
# so referencing it here fails on a fresh kernel.
displacy.render(tokens, style='dep', jupyter=True)

**Similarity** is determined by comparing word vectors, also called "word embeddings".

In [26]:
# Pairwise similarity between the first three tokens of the sentence
# (every ordered pair, including each token with itself).
first_three = tokens[:3]
for token1 in first_three:
    for token2 in first_three:
        score = token1.similarity(token2)
        print(token1.text, token2.text, score)

Apple Apple 1.0
Apple is -0.056064684
Apple looking 0.1736301
is Apple -0.056064684
is is 1.0
is looking 0.0364612
looking Apple 0.1736301
looking is 0.0364612
looking looking 1.0


# Entities

In [18]:
# List the named entities found in the example sentence: text, start and end
# character offsets, and entity label.
# Reads from `tokens` (the parsed sentence); `doc` is only assigned in a later cell.
for ent in tokens.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


http://localhost:5000/

In [31]:
# Highlight the named entities of the example sentence inline in the notebook.
# Uses `tokens` (the parsed sentence); `doc` is only assigned in a later cell.
displacy.render(tokens, style='ent', jupyter=True)

# Vocab, hash, lexeme

<img src="./figure/doc_vocab.png" width="400">

>The central data structures in spaCy are the Doc and the Vocab. The Doc object owns the sequence of tokens and all their annotations. The Vocab object owns a set of look-up tables that make common information available across documents. By centralising strings, word vectors and lexical attributes, we avoid storing multiple copies of this data. This saves memory, and ensures there's a single source of truth.

In [33]:
# The vocabulary object attached to the parsed sentence.
# Accessed through `tokens` because `doc` is only assigned in a later cell.
tokens.vocab

<spacy.vocab.Vocab at 0x10fba87c8>

**hashes** save memory/space

In [34]:
# The string store owned by the vocabulary.
# Accessed through `tokens` because `doc` is only assigned in a later cell.
tokens.vocab.strings

<spacy.strings.StringStore at 0x10a42e828>

In [32]:
# The string store maps in both directions: string -> hash and hash -> string.
# Accessed through `tokens` because `doc` is only assigned in a later cell.
print(tokens.vocab.strings[u'coffee'])  # 3197928453018144401
print(tokens.vocab.strings[3197928453018144401])  # 'coffee'

3197928453018144401
coffee


A **lexeme** is a vocabulary entry: it holds all the information about a word that is independent of context.

In [36]:
# Indexing the vocabulary with a string yields a Lexeme.
# The original cell read `doc` and `word`, which are only assigned by the next
# cell; look the word up directly on the pipeline's vocabulary instead.
nlp.vocab[u'coffee']

<spacy.lexeme.Lexeme at 0x1265d9cf0>

In [35]:
# Parse a short sentence and, for each word, print the context-independent
# attributes of its vocabulary entry.
doc = nlp(u'I love coffee')
for word in doc:
    lexeme = doc.vocab[word.text]
    row = (
        lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
        lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_,
    )
    print(*row)

I 4690420944186131903 X I I True False True en
love 3702023516439754181 xxxx l ove True False False en
coffee 3197928453018144401 xxxx c fee True False False en


**doc** contains tokens in context

In [38]:
# Check the type of the object returned by `nlp(u'I love coffee')`.
type(doc)

spacy.tokens.doc.Doc

In [42]:
# Dependency label of the second token of `doc` (index 1).
doc[1].dep_

'ROOT'

# Convert to `numpy`

In [65]:
from spacy.attrs import ORTH
# Export one attribute (ORTH) per token of `doc` as a numpy array.
attr_ids = [ORTH]
doc_array = doc.to_array(attr_ids)
# The values shown are the same integers printed as `lexeme.orth` for
# 'I', 'love' and 'coffee' in the lexeme table earlier in the notebook.
doc_array

array([4690420944186131903, 3702023516439754181, 3197928453018144401],
      dtype=uint64)

# Serialization

In [43]:
# Write the processed `doc` to a file on disk.
# NOTE(review): assumes the ./model directory already exists — TODO confirm.
doc.to_disk('./model/spacy_test.bin')

In [45]:
from spacy.tokens import Doc # to create empty Doc
from spacy.vocab import Vocab # to create empty Vocab
# Build a Doc around a fresh, empty Vocab, populate it from
# './model/spacy_test.bin', and rebind `doc` to the loaded result.
doc = Doc(Vocab()).from_disk('./model/spacy_test.bin') # load processed Doc

# Model training with NN

In [56]:
# Reload the pretrained pipeline (rebinding `nlp`) and update only its entity
# recognizer on a single annotated example, then save the pipeline to disk.
nlp = spacy.load('en_core_web_sm')
# One (text, annotations) pair; entities are (start_char, end_char, label).
train_data = [("Uber blew through $1 million", {'entities': [(0, 4, 'ORG')]})] # start 0, end 4

# Every pipeline component except 'ner' is disabled for the duration of the block.
non_ner_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*non_ner_pipes):
    # NOTE(review): on newer spaCy 2.x releases begin_training() is documented as
    # (re)initializing model weights, with resume_training() intended for updating
    # a loaded model — confirm against the installed spaCy version.
    optimizer = nlp.begin_training()
    for i in range(10):
        # Ten passes over the data, reshuffled before each pass.
        random.shuffle(train_data)
        for text, annotations in train_data:
            nlp.update([text], [annotations], sgd=optimizer)
# NOTE(review): assumes the ./model directory already exists — TODO confirm.
nlp.to_disk('./model/temp_spacy')

