Ref: https://spacy.io/usage/spacy-101

In [2]:
import random
import spacy
from spacy import displacy

# Load Model and Pipeline

<img src="./figure/pipeline.png" width="600">


In [3]:
# Load the `en_core_web_sm` English model; `nlp` is the pipeline object used by the cells below.
nlp = spacy.load('en_core_web_sm')

In [4]:
# Inspect the model's metadata (language, pipeline components, accuracy, version, ...).
nlp.meta

{'lang': 'en',
 'pipeline': ['tagger', 'parser', 'ner'],
 'accuracy': {'token_acc': 99.8698372794,
  'ents_p': 84.9664503965,
  'ents_r': 85.6312524451,
  'uas': 91.7237657538,
  'tags_acc': 97.0403350292,
  'ents_f': 85.2975560875,
  'las': 89.800872413},
 'name': 'core_web_sm',
 'license': 'CC BY-SA 3.0',
 'author': 'Explosion AI',
 'url': 'https://explosion.ai',
 'vectors': {'width': 0, 'vectors': 0, 'keys': 0, 'name': None},
 'sources': ['OntoNotes 5', 'Common Crawl'],
 'version': '2.0.0',
 'spacy_version': '>=2.0.0a18',
 'parent_package': 'spacy',
 'speed': {'gpu': None, 'nwords': 291344, 'cpu': 5122.3040471407},
 'email': 'contact@explosion.ai',
 'description': 'English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities.'}

In [5]:
# List the pipeline's (name, component) pairs.
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x104220c18>),
 ('parser', <spacy.pipeline.DependencyParser at 0x113814620>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x113814678>)]

In [6]:
# Check the concrete class of the loaded pipeline object.
type(nlp)

spacy.lang.en.English

A **custom pipeline component** is a callable that takes in a `Doc` and returns a `Doc`.

In [7]:
def my_component(doc):
    """Report how many tokens `doc` holds, then hand it back untouched.

    Shaped like a pipeline component: it receives a doc and returns that
    same doc, so it can be chained with other components.
    """
    token_count = len(doc)
    message = "After tokenization, this doc has %s tokens.\n\n" % token_count
    print(message)
    return doc
# Register the component under the name 'print_info' at the front of the pipeline (first=True).
nlp.add_pipe(my_component, name='print_info', first=True)

# Token Attributes

<img src="https://spacy.io/assets/img/architecture.svg" width="600">


In [10]:
# Token attributes to show, in column order.
token_fields = (
    'text',
    'lemma_',
    'pos_',
    'tag_',
    'dep_',
    'shape_',
    'is_alpha',
    'is_stop',
    'ent_type_',  # one entity has multiple tokens
    'ent_iob_',
)

doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
for token in doc:
    # One output row per token: its attribute values separated by spaces.
    print(*[getattr(token, field) for field in token_fields])

After tokenization, this doc has 11 tokens.


Apple apple PROPN NNP nsubj Xxxxx True False ORG B
is be VERB VBZ aux xx True True  O
looking look VERB VBG ROOT xxxx True False  O
at at ADP IN prep xx True True  O
buying buy VERB VBG pcomp xxxx True False  O
U.K. u.k. PROPN NNP compound X.X. False False GPE B
startup startup NOUN NN dobj xxxx True False  O
for for ADP IN prep xxx True True  O
$ $ SYM $ quantmod $ False False MONEY B
1 1 NUM CD compound d False False MONEY I
billion billion NUM CD pobj xxxx True False MONEY I


## noun chunks

In [11]:
# Per noun chunk: the chunk text, its root token, the root's dependency
# label, and the text of the root's head.
for chunk in doc.noun_chunks:
    root = chunk.root
    print(chunk.text, root.text, root.dep_, root.head.text)

Apple Apple nsubj looking
U.K. startup startup dobj buying


## Dependency parsing

In [12]:
# Per token: text, dependency label, head text, head part-of-speech, children.
for token in doc:
    head = token.head
    children = list(token.children)
    print(token.text, token.dep_, head.text, head.pos_, children)

Apple nsubj looking VERB []
is aux looking VERB []
looking ROOT looking VERB [Apple, is, at]
at prep looking VERB [buying]
buying pcomp at ADP [startup, for]
U.K. compound startup NOUN []
startup dobj buying VERB [U.K.]
for prep buying VERB [billion]
$ quantmod billion NUM []
1 compound billion NUM []
billion pobj for ADP [$, 1]


In [13]:
# Render the dependency parse of `doc` (style='dep') for display in Jupyter.
displacy.render(doc, style='dep', jupyter=True)

## Similarity
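**Similarity** is determined by comparing word vectors or "word embeddings". Note that the `nlp.meta` output above reports `'vectors': {'width': 0, ...}` for `en_core_web_sm`, i.e. this model ships without word vectors, so the scores below should be read as a rough indication only.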

In [14]:
# Compare every pair drawn from the first three tokens (including each
# token with itself).
first_tokens = doc[:3]
for token1 in first_tokens:
    for token2 in first_tokens:
        score = token1.similarity(token2)
        print(token1.text, token2.text, score)

Apple Apple 1.0
Apple is -0.056064684
Apple looking 0.1736301
is Apple -0.056064684
is is 1.0
is looking 0.0364612
looking Apple 0.1736301
looking is 0.0364612
looking looking 1.0


## Customized Tokenizer

In [15]:
class WhitespaceTokenizer(object):
    """Tokenizer that splits text on single space characters only.

    Instances are callable: given a string they return a `Doc` built on the
    vocab supplied at construction time.
    """

    def __init__(self, vocab):
        """Keep the vocab that every produced `Doc` is created with."""
        self.vocab = vocab

    def __call__(self, text):
        """Split `text` on ' ' and wrap the resulting words in a `Doc`."""
        # Imported here so the tokenizer works on a fresh kernel: the
        # notebook only imports `Doc` at module level in a much later cell.
        from spacy.tokens import Doc

        words = text.split(' ')
        # All tokens 'own' a subsequent space character in this tokenizer
        # (this includes the last token).
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

In [16]:
# Replace the pipeline's tokenizer with the whitespace-only one, sharing the pipeline's vocab.
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

# Entity/Span Attributes

In [17]:
# Check what kind of object the first entry of doc.ents is.
type(doc.ents[0])

spacy.tokens.span.Span

In [18]:
# Per entity: text, start/end character offsets, label string, integer label.
for ent in doc.ents:
    span_info = (ent.text, ent.start_char, ent.end_char, ent.label_, ent.label)
    print(*span_info)

Apple 0 5 ORG 381
U.K. 27 31 GPE 382
$1 billion 44 54 MONEY 391


In [20]:
# token level: [text, IOB tag, entity type] for the first two tokens
ent_apple, ent_is = (
    [token.text, token.ent_iob_, token.ent_type_]
    for token in (doc[0], doc[1])
)
print(ent_apple)
print(ent_is)

['Apple', 'B', 'ORG']
['is', 'O', '']


In [28]:
# Render the named entities of `doc` (style='ent') for display in Jupyter.
displacy.render(doc, style='ent', jupyter=True)

# Sentences

>Unlike other libraries, spaCy uses the dependency parse to determine sentence boundaries.

In [48]:
# Load the model again and parse a text that contains '...' between clauses.
nlp = spacy.load('en_core_web_sm')
doc = nlp(u"this is a sentence...hello...and another sentence.")

In [49]:
# Print each sentence of the doc on its own line.
for sent in doc.sents:
    print(sent.text)

this is a sentence...
hello...and another sentence.


In [50]:
def set_custom_boundaries(doc):
    """Flag the token that follows every '...' token as a sentence start.

    The last token is never examined, so a flagged "next" token always
    exists. The doc is modified in place and returned.
    """
    ellipsis_indices = [token.i for token in doc[:-1] if token.text == '...']
    for index in ellipsis_indices:
        doc[index + 1].is_sent_start = True
    return doc

In [51]:
# Show each token with its is_sent_start flag for the doc parsed above
# (the custom boundary component has not been added to the pipeline yet).
for token in doc:
    print(token.text,
          token.is_sent_start)

this None
is None
a None
sentence None
... None
hello True
... None
and None
another None
sentence None
. None


In [52]:
# Insert the boundary component ahead of the parser, then list the component names.
nlp.add_pipe(set_custom_boundaries, before='parser')
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [53]:
# Parse the same text again, now with the custom component in the pipeline,
# and show each token's is_sent_start flag.
doc = nlp(u"this is a sentence...hello...and another sentence.")
for token in doc:
    print(token.text,
          token.is_sent_start)

this None
is None
a None
sentence None
... None
hello True
... None
and True
another None
sentence None
. None


In [54]:
# Print the sentences of the re-parsed doc, one per line.
for sent in doc.sents:
    print(sent.text)

this is a sentence...
hello...
and another sentence.


# Vocab, hash, lexeme

<img src="./figure/doc_vocab.png" width="400">

>The central data structures in spaCy are the Doc and the Vocab. The Doc object owns the sequence of tokens and all their annotations. The Vocab object owns a set of look-up tables that make common information available across documents. By centralising strings, word vectors and lexical attributes, we avoid storing multiple copies of this data. This saves memory, and ensures there's a single source of truth.

In [33]:
# The Vocab object attached to this doc.
doc.vocab

<spacy.vocab.Vocab at 0x10fba87c8>

## hashes
**Hashes** save memory: each string is stored once in the `StringStore` and referred to everywhere else by its integer hash.

In [34]:
# The vocab's StringStore.
doc.vocab.strings

<spacy.strings.StringStore at 0x10a42e828>

In [32]:
# Round trip through the string store: string -> hash, then hash -> string.
# The hash is taken from the first lookup rather than hard-coded.
coffee_hash = doc.vocab.strings[u'coffee']
print(coffee_hash)  # 3197928453018144401
print(doc.vocab.strings[coffee_hash])  # 'coffee'

3197928453018144401
coffee


## Lexeme
A **lexeme** holds all the information about a word that is independent of context.

In [36]:
# Look up the vocab entry for a word. A literal string is used because the
# loop variable `word` is not defined until the next cell runs.
doc.vocab[u'coffee']

<spacy.lexeme.Lexeme at 0x1265d9cf0>

In [56]:
# Lexeme attributes to show, in column order.
lexeme_fields = (
    'text',
    'orth',
    'shape_',
    'prefix_',
    'suffix_',
    'is_alpha',
    'is_digit',
    'is_title',
    'lang_',
)

doc = nlp(u'I love coffee')
for word in doc:
    # Fetch the vocab entry for this token's text and print one row for it.
    lexeme = doc.vocab[word.text]
    print(*[getattr(lexeme, field) for field in lexeme_fields])

I 4690420944186131903 X I I True False True en
love 3702023516439754181 xxxx l ove True False False en
coffee 3197928453018144401 xxxx c fee True False False en


A **doc**, by contrast, contains tokens in context.

# Convert to `numpy`

In [65]:
from spacy.attrs import ORTH
# Export the ORTH attribute of every token in the doc as an array.
attr_ids = [ORTH]
doc_array = doc.to_array(attr_ids)
doc_array

array([4690420944186131903, 3702023516439754181, 3197928453018144401],
      dtype=uint64)

# Serialization

In [43]:
# Write the processed doc to a file.
# NOTE(review): presumably requires the ./model directory to exist already — confirm.
doc.to_disk('./model/spacy_test.bin') 

In [45]:
# Read the doc back from the file written above, starting from an empty Doc
# built on an empty Vocab.
from spacy.tokens import Doc # to create empty Doc
from spacy.vocab import Vocab # to create empty Vocab
doc = Doc(Vocab()).from_disk('./model/spacy_test.bin') # load processed Doc

# NER Model training

- https://spacy.io/usage/linguistic-features#section-named-entities
- https://spacy.io/usage/training#section-ner

In [26]:
# Update only the 'ner' component of a freshly loaded model on a single
# annotated example, then save the whole pipeline to disk.
nlp = spacy.load('en_core_web_sm')
train_data = [("Uber blew through $1 million", {'entities': [(0, 4, 'PERSON')]})] # start 0, end 4
n_iter = 10

# Every component except 'ner' is disabled for the duration of training.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for epoch in range(n_iter):
        random.shuffle(train_data)
        for text, annotations in train_data:
            nlp.update([text], [annotations], sgd=optimizer)
nlp.to_disk('./model/temp_spacy')



In [27]:
# Run the updated pipeline on the training sentence.
doc = nlp(u'Uber blew through $1 million')

In [28]:
# Per entity: text, start/end character offsets, label string, integer label.
for ent in doc.ents:
    span_info = (ent.text, ent.start_char, ent.end_char, ent.label_, ent.label)
    print(*span_info)

Uber 0 4 PERSON 378
