In [25]:
#!/usr/bin/env python
# coding: utf8
"""Example of training spaCy's named entity recognizer, starting off with an
existing model or a blank model.
For more details, see the documentation:
* Training: https://spacy.io/usage/training
* NER: https://spacy.io/usage/linguistic-features#named-entities
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
import collections
import codecs
import sklearn

# Toy NER corpus. Every item is a (text, annotations) pair and each entry of
# annotations['entities'] is a (start, end, label) triple whose offsets slice
# the entity out of the text, e.g. 'Who is Shaka Khan?'[7:17] == 'Shaka Khan'.
train_data = [
    ('Who is Shaka Khan?', {'entities': [(7, 17, 'PERSON')]}),
    (
        'I like London and Berlin.',
        {'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]},
    ),
]
    
def train_ner(model=None, output_dir=None, n_iter=100, train_data=train_data):
    """Load or create a spaCy pipeline, train its entity recognizer, return it.

    Args:
        model: Value passed to ``spacy.load``. When ``None`` a blank English
            pipeline is created with ``spacy.blank('en')`` instead.
        output_dir: Optional directory. When given, the trained pipeline is
            written there with ``nlp.to_disk`` (the directory is created if it
            does not exist). When ``None`` nothing is written.
        n_iter: Number of passes over the training examples.
        train_data: Iterable of ``(text, annotations)`` pairs, where
            ``annotations['entities']`` lists ``(start, end, label)`` tuples.
            The examples are copied before shuffling, so the caller's list
            (including the module-level default) keeps its order.

    Returns:
        The trained ``nlp`` object.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # Reuse the pipeline's entity recognizer when it already has one;
    # otherwise create the built-in component and append it to the pipeline.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # Shuffle a private copy: random.shuffle works in place, and shuffling the
    # argument itself would silently reorder the caller's (shared) list.
    examples = list(train_data)

    # Register every label that occurs in the annotations. A missing or empty
    # 'entities' entry simply contributes no labels.
    for _, annotations in examples:
        for ent in annotations.get('entities') or []:
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # NOTE(review): begin_training() is called even when an existing model
        # was loaded -- confirm for the installed spaCy version that this does
        # not discard the loaded weights.
        optimizer = nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(examples)
            losses = {}
            for text, annotations in examples:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)

    # Run the trained pipeline over the training texts as a quick sanity check.
    for text, _ in examples:
        doc = nlp(text)
        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
        print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # Persist the pipeline when the caller asked for it.
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir(parents=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

    return nlp


In [33]:
def parse_dataset(dataset_filepath, max_line_index=20):
    """Read a space-separated, one-token-per-line dataset file.

    Each non-blank line is split on single spaces; the first field is taken as
    the token and the last field as its label. A blank line, or a line whose
    first field contains ``-DOCSTART-``, ends the current sequence.

    Args:
        dataset_filepath: Path of a UTF-8 encoded file. A falsy value (``None``
            or ``''``) skips reading and yields empty results.
        max_line_index: Debugging cut-off. Reading stops right after the first
            token line whose 0-based line index is greater than this value.
            The default of 20 keeps the cut-off this function has always had;
            pass ``None`` to read the whole file.

    Returns:
        A tuple ``(labels, tokens, token_count, label_count, character_count)``:
        ``labels`` and ``tokens`` are parallel lists of sequences (lists of
        strings), and the three counts are ``defaultdict`` objects mapping a
        token / label / character to its number of occurrences.
    """
    token_count = collections.defaultdict(int)
    label_count = collections.defaultdict(int)
    character_count = collections.defaultdict(int)

    tokens = []
    labels = []
    if not dataset_filepath:
        return labels, tokens, token_count, label_count, character_count

    current_tokens = []
    current_labels = []
    # The context manager closes the file even if decoding or parsing raises.
    with codecs.open(dataset_filepath, 'r', 'UTF-8') as f:
        for line_index, raw_line in enumerate(f):
            fields = raw_line.strip().split(' ')
            # split(' ') always returns at least one field, so an empty first
            # field is what identifies a blank line.
            if len(fields[0]) == 0 or '-DOCSTART-' in fields[0]:
                if current_tokens:
                    labels.append(current_labels)
                    tokens.append(current_tokens)
                    current_tokens = []
                    current_labels = []
                continue

            # The text is already decoded; wrapping it in str() would fail on
            # non-ASCII tokens under Python 2 and changes nothing on Python 3.
            token = fields[0]
            label = fields[-1]
            token_count[token] += 1
            label_count[label] += 1
            current_tokens.append(token)
            current_labels.append(label)
            for character in token:
                character_count[character] += 1

            # Only checked after token lines, exactly like the original
            # hard-coded debugging break.
            if max_line_index is not None and line_index > max_line_index:
                break

    # Flush the sequence that was still open at end of file / cut-off.
    if current_tokens:
        labels.append(current_labels)
        tokens.append(current_tokens)
    return labels, tokens, token_count, label_count, character_count

In [34]:
# Parse the training file. Only the label and token sequences are inspected
# here; the three count dictionaries returned alongside them are discarded.
labels, tokens, _, _, _ = parse_dataset(dataset_filepath='en/train.txt')
print(labels, tokens, sep='\n')
# Training call, currently disabled:
# train_ner(n_iter=10, train_data=train_data)

[['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'], ['B-PER', 'I-PER'], ['B-LOC', 'O'], ['O', 'B-ORG', 'I-ORG', 'O']]
[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['Peter', 'Blackburn'], ['BRUSSELS', '1996-08-22'], ['The', 'European', 'Commission', 'said']]


In [None]:
def _index_vocabulary(items, reserved_index=None):
    """Give the sorted unique ``items`` consecutive indices, skipping ``reserved_index``."""
    vocabulary = {}
    next_index = 0
    for item in sorted(set(items)):
        if next_index == reserved_index:
            next_index += 1
        vocabulary[item] = next_index
        next_index += 1
    return vocabulary


def _pad_list(values, length, padding_value):
    """Return ``values`` extended on the right with ``padding_value`` up to ``length``."""
    return list(values) + [padding_value] * (length - len(values))


def convert_to_indices(labels, tokens, dataset_types,
                       token_to_index=None, character_to_index=None,
                       label_to_index=None,
                       unk_token_index=0, padding_character_index=0):
    """Map token, character and label sequences to integer indices.

    The previous body read every lookup table from an undefined ``self`` and
    so raised ``NameError`` as soon as it was called. The tables are now
    optional keyword arguments and are derived from the data when omitted.

    Args:
        labels: Mapping from dataset type to a list of label sequences.
        tokens: Mapping from dataset type to a list of token sequences.
        dataset_types: Iterable of the dataset types (keys of ``labels`` and
            ``tokens``) to convert.
        token_to_index: Optional token -> index mapping. When ``None`` it is
            built from the sorted unique tokens of the requested dataset
            types, leaving ``unk_token_index`` unassigned.
        character_to_index: Optional character -> index mapping. When ``None``
            it is built the same way from the characters of those tokens,
            leaving ``padding_character_index`` unassigned.
        label_to_index: Optional label -> index mapping. When ``None`` the
            sorted unique labels are numbered from 0.
        unk_token_index: Index used for tokens missing from ``token_to_index``.
        padding_character_index: Index used to right-pad the character
            indices of each token to the longest token of its sequence.

    Returns:
        A tuple of seven dictionaries keyed by dataset type:
        ``(token_indices, label_indices, character_indices_padded,
        character_indices, token_lengths, characters, label_vector_indices)``.
        ``label_vector_indices`` holds, per label sequence, the output of
        ``LabelBinarizer.transform`` fitted on ``range(max label index + 1)``.

    Raises:
        KeyError: If a label is missing from ``label_to_index``.
    """
    dataset_types = list(dataset_types)

    if token_to_index is None:
        token_to_index = _index_vocabulary(
            (token
             for dataset_type in dataset_types
             for token_sequence in tokens[dataset_type]
             for token in token_sequence),
            reserved_index=unk_token_index)
    if character_to_index is None:
        character_to_index = _index_vocabulary(
            (character
             for dataset_type in dataset_types
             for token_sequence in tokens[dataset_type]
             for token in token_sequence
             for character in token),
            reserved_index=padding_character_index)
    if label_to_index is None:
        label_to_index = _index_vocabulary(
            label
            for dataset_type in dataset_types
            for label_sequence in labels[dataset_type]
            for label in label_sequence)

    # Characters missing from the mapping get a random index between 1 and the
    # highest known index, as the original lookup did; the floor of 1 keeps
    # randint valid when the mapping is empty.
    highest_character_index = max([1] + list(character_to_index.values()))

    def lookup_character(character):
        if character in character_to_index:
            return character_to_index[character]
        return random.randint(1, highest_character_index)

    # Map tokens and labels to their indices
    token_indices = {}
    label_indices = {}
    characters = {}
    token_lengths = {}
    character_indices = {}
    character_indices_padded = {}
    for dataset_type in dataset_types:
        token_indices[dataset_type] = []
        characters[dataset_type] = []
        character_indices[dataset_type] = []
        token_lengths[dataset_type] = []
        character_indices_padded[dataset_type] = []
        for token_sequence in tokens[dataset_type]:
            sequence_character_indices = [
                [lookup_character(character) for character in token]
                for token in token_sequence]
            sequence_token_lengths = [len(token) for token in token_sequence]
            # An empty sequence has no longest token; pad to length 0.
            longest_token_length = max(sequence_token_lengths) if sequence_token_lengths else 0

            token_indices[dataset_type].append(
                [token_to_index.get(token, unk_token_index) for token in token_sequence])
            characters[dataset_type].append([list(token) for token in token_sequence])
            character_indices[dataset_type].append(sequence_character_indices)
            token_lengths[dataset_type].append(sequence_token_lengths)
            character_indices_padded[dataset_type].append(
                [_pad_list(indices, longest_token_length, padding_character_index)
                 for indices in sequence_character_indices])

        label_indices[dataset_type] = [
            [label_to_index[label] for label in label_sequence]
            for label_sequence in labels[dataset_type]]

    # One-hot encode the label indices.
    label_vector_indices = {dataset_type: [] for dataset_type in dataset_types}
    if label_to_index:
        # Imported explicitly: a bare ``import sklearn`` does not guarantee
        # that the ``preprocessing`` submodule has been loaded.
        from sklearn.preprocessing import LabelBinarizer
        # NOTE(review): per scikit-learn's documentation a binarizer fitted on
        # exactly two classes emits a single column -- confirm callers cope.
        label_binarizer = LabelBinarizer()
        label_binarizer.fit(range(max(label_to_index.values()) + 1))
        for dataset_type in dataset_types:
            for label_indices_sequence in label_indices[dataset_type]:
                label_vector_indices[dataset_type].append(
                    label_binarizer.transform(label_indices_sequence))

    return token_indices, label_indices, character_indices_padded, character_indices, token_lengths, characters, label_vector_indices


In [1]:
# Runs spaCy's `convert` command-line entry point on en/train.txt, with
# `data` as its second argument.
# NOTE(review): the output recorded for this cell shows the command failing
# with "Unknown format / Can't find converter for txt" -- presumably the
# converter is chosen from the file extension; confirm which input formats the
# installed spaCy version supports before re-running.
!python -m spacy convert en/train.txt data


[93m    Unknown format[0m
    Can't find converter for txt

