In [25]:
#!/usr/bin/env python
# coding: utf8
"""Example of training spaCy's named entity recognizer, starting off with an
existing model or a blank model.
For more details, see the documentation:
* Training: https://spacy.io/usage/training
* NER: https://spacy.io/usage/linguistic-features#named-entities
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
import collections
import codecs
import sklearn

# Toy training set. Each example is a (text, annotations) pair; every entity
# is given as (start_char, end_char, label), where text[start_char:end_char]
# is the entity string.
train_data = [
    (
        'Who is Shaka Khan?',
        {'entities': [(7, 17, 'PERSON')]},
    ),
    (
        'I like London and Berlin.',
        {'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]},
    ),
]
    
def train_ner(model=None, output_dir=None, n_iter=100, train_data=train_data):
    """Load the model, set up the pipeline and train the entity recognizer.

    The function loads ``model`` (or creates a blank English pipeline), makes
    sure the pipeline has an ``ner`` component that knows every label used in
    ``train_data``, trains it one example at a time with all other pipes
    disabled, prints the predictions for the training texts and, if
    requested, saves the pipeline to disk.

    Args:
        model: name or path handed to ``spacy.load``. ``None`` creates a
            blank ``'en'`` pipeline instead.
        output_dir: directory the trained pipeline is written to with
            ``nlp.to_disk``. ``None`` (the default) skips saving.
        n_iter: number of passes over the training examples.
        train_data: iterable of ``(text, annotations)`` pairs where
            ``annotations['entities']`` is a list of
            ``(start_char, end_char, label)`` tuples. The iterable is copied;
            the caller's object is never reordered.

    Returns:
        The trained ``nlp`` pipeline object.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # Reuse the pipeline's NER component when it has one, otherwise create
    # the built-in component and append it to the pipeline.
    if 'ner' in nlp.pipe_names:
        ner = nlp.get_pipe('ner')
    else:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)

    # Private copies: shuffling below must not reorder the caller's list
    # (which, by default, is the module-level ``train_data``).
    examples = list(train_data)
    shuffled = list(examples)

    # Register every entity label that occurs in the training data.
    # A missing 'entities' key is treated as "no entities".
    for _text, annotations in examples:
        for ent in annotations.get('entities', []):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # NOTE(review): begin_training() is also called when an existing
        # model was loaded; confirm against the spaCy docs whether that
        # re-initialises pretrained weights for the spaCy version in use.
        optimizer = nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(shuffled)
            losses = {}
            for text, annotations in shuffled:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)

    # Sanity check: show what the trained model predicts on the training texts.
    for text, _annotations in examples:
        doc = nlp(text)
        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
        print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # Persist the pipeline when a target directory was given; previously the
    # ``output_dir`` argument was accepted but silently ignored.
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir(parents=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

    return nlp


In [33]:
def parse_dataset(dataset_filepath, max_lines=20):
    """Parse a CoNLL-style file into token/label sequences and frequency counts.

    Every line is stripped and split on single spaces. The first field is the
    token and the last field is its label. An empty line, or a line whose
    first field contains ``-DOCSTART-``, closes the current sequence.

    Args:
        dataset_filepath: path of a UTF-8 encoded file. A falsy value
            (``None`` or ``''``) skips reading and yields empty results.
        max_lines: debugging cap. Reading stops right after the first token
            line whose zero-based line index is greater than this value. The
            default of 20 preserves the notebook's quick-look behaviour; pass
            ``None`` to read the whole file.

    Returns:
        A tuple ``(labels, tokens, token_count, label_count, character_count)``.
        ``labels`` and ``tokens`` are parallel lists of sequences (lists of
        strings); the three counts are ``defaultdict`` objects mapping a
        token / label / character to its number of occurrences.
    """
    token_count = collections.defaultdict(int)
    label_count = collections.defaultdict(int)
    character_count = collections.defaultdict(int)

    tokens = []
    labels = []
    if not dataset_filepath:
        return labels, tokens, token_count, label_count, character_count

    current_tokens = []
    current_labels = []
    # The context manager closes the file even if parsing raises.
    with codecs.open(dataset_filepath, 'r', 'UTF-8') as f:
        for line_index, raw_line in enumerate(f):
            fields = raw_line.strip().split(' ')
            # split(' ') always returns at least one field, so an empty
            # first field is what identifies a blank line.
            if not fields[0] or '-DOCSTART-' in fields[0]:
                if current_tokens:
                    labels.append(current_labels)
                    tokens.append(current_tokens)
                    current_tokens = []
                    current_labels = []
                continue

            # codecs.open already yields decoded text, so no str() conversion
            # is needed (on Python 2 it would fail for non-ASCII tokens).
            token = fields[0]
            label = fields[-1]
            token_count[token] += 1
            label_count[label] += 1

            current_tokens.append(token)
            current_labels.append(label)

            for character in token:
                character_count[character] += 1

            # Optional early stop for quick experiments on large files.
            if max_lines is not None and line_index > max_lines:
                break

    # Flush the last sequence when the file does not end with a blank line
    # (or when the debugging cap cut the read short).
    if current_tokens:
        labels.append(current_labels)
        tokens.append(current_tokens)
    return labels, tokens, token_count, label_count, character_count

In [34]:
# Parse the training file; only the label and token sequences are needed here.
# Slicing the result (instead of unpacking the counts into `_`) avoids
# rebinding IPython's `_` last-output variable in the notebook namespace.
labels, tokens = parse_dataset('en/train.txt')[:2]
print(labels)
print(tokens)
# Optional: run the in-notebook training loop on the toy examples.
# train_ner(n_iter=10, train_data=train_data)

[['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'], ['B-PER', 'I-PER'], ['B-LOC', 'O'], ['O', 'B-ORG', 'I-ORG', 'O']]
[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['Peter', 'Blackburn'], ['BRUSSELS', '1996-08-22'], ['The', 'European', 'Commission', 'said']]




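## Training with the spaCy CLI

The cells below use the `spacy` command-line interface: they convert the CoNLL files to spaCy's JSON format, train a model, and evaluate it.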

In [5]:
# Convert the training split en/train.txt to spaCy's JSON training format;
# the result is written into the data/ directory. `-c ner` selects the
# converter used for this file.
!python -m spacy convert en/train.txt data -c ner


[93m    Generated output file data/train.txt.json[0m
    Created 1 documents



In [7]:
# Convert the test split en/test.txt to spaCy's JSON format in data/,
# using the same `-c ner` converter option as for the training split.
!python -m spacy convert en/test.txt data -c ner


[93m    Generated output file data/test.txt.json[0m
    Created 1 documents



In [9]:
# Convert the validation split en/valid.txt to spaCy's JSON format in data/,
# using the same `-c ner` converter option as for the training split.
!python -m spacy convert en/valid.txt data -c ner


[93m    Generated output file data/valid.txt.json[0m
    Created 1 documents



In [22]:
# Train an English pipeline on the converted training data, using the
# validation data for the per-iteration scores; models are written to model/.
# NOTE(review): in the spaCy v2 CLI, -T / -P appear to disable the tagger and
# parser and -G to enable gold preprocessing - confirm for the installed version.
!python -m spacy train en model data/train.txt.json data/valid.txt.json -G -T -P

dropout_from = 0.2 by default
dropout_to = 0.2 by default
dropout_decay = 0.0 by default
batch_from = 1 by default
batch_to = 16 by default
batch_compound = 1.001 by default
max_doc_len = 5000 by default
beam_width = 1 by default
beam_density = 0.0 by default
learn_rate = 0.001 by default
optimizer_B1 = 0.9 by default
optimizer_B2 = 0.999 by default
optimizer_eps = 1e-08 by default
L2_penalty = 1e-06 by default
grad_norm_clip = 1.0 by default
parser_hidden_depth = 1 by default
parser_maxout_pieces = 2 by default
token_vector_width = 128 by default
hidden_width = 200 by default
embed_size = 7000 by default
history_feats = 0 by default
history_width = 0 by default
Itn.	P.Loss	N.Loss	UAS	NER P.	NER R.	NER F.	Tag %	Token %
0	0.000	2455.070	0.000	81.650	82.447	82.047	0.000	100.000	11058.4	0.0           
1	0.000	23.767	0.000	85.959	86.957	86.455	0.000	100.000	10791.8	0.0             
2	0.000	14.371	0.000	87.707	88.371	88.038	0.000	100.000	10678.3	0.0             
3	0.000	10.352	0.000	87.926	

In [23]:
# Evaluate the saved model in model/model4 on the converted test split.
# `-dp result` additionally writes parses rendered as HTML into result/.
# NOTE(review): model4 is presumably the checkpoint written after training
# iteration 4 - confirm that directory exists before running this cell.
!python -m spacy evaluate model/model4 data/test.txt.json -dp result


    [93mResults[0m

    Time               2.89 s         
    Words              46666          
    NER P              76.84          
    TOK                100.00         
    LAS                0.00           
    NER R              78.17          
    NER F              77.50          
    UAS                0.00           
    Words/s            16130          
    POS                0.00           


[93m    Generated 25 parses as HTML[0m
    result

