In [136]:
import json
import math
import os
import random
from operator import itemgetter
from pathlib import Path
from pprint import pprint

import spacy
from spacy import displacy
from spacy.util import compounding, minibatch

# spaCy NER Workflow
# a blog post about this work is available here: https://content.fromthepage.com/machine-learning-to-extract-entities-from-ancient-greek-and-other-languages/ This notebook defines a workflow for training a spaCy NER model. The code below uses an Ancient Greek training set with two types of entities, `PERSON` and `LOC`.

## Settings
`TRAIN_FILEPATH`: path to a `json` file with the training data in the format:
```
[
['text with one entity only', {
    'entities': [[start, end, 'ENTITY_TYPE']]
}],
['text with no entities', {'entities': []}],
['text with multiple entities', {
    'entities': [[start, end, 'ENTITY_TYPE'], [start, end, 'ENTITY_TYPE'], [start, end, 'ENTITY_TYPE']]
}]
]
```

`TRAIN_RATIO`: the fraction of the data to be used for training, between 0 and 1

`TRAIN_LANGUAGE`: the language for the model being trained. See https://spacy.io/usage/models#alpha-support for more information on the language codes. Note that spaCy doesn't seem to support ancient language codes.

`TRAIN_ITERATIONS`: number of iterations to run the training. Start with a small number to test the workflow, and increase it later to improve accuracy

`TRAIN_MODEL_DIR`: name of the directory to save the model into

`TRAIN_MODEL_NAME`: name for the new/trained model

In [134]:
# input: the annotated examples and the share of them used for training
TRAIN_FILEPATH = 'training/json/agathemerus-with-context.json'
TRAIN_RATIO = 0.9

# training: language code of the blank model and number of passes over the data
TRAIN_LANGUAGE = 'el'
TRAIN_ITERATIONS = 5

# output: the directory and model name are both derived from the language code
TRAIN_MODEL_NAME = '{lang}-ner'.format(lang=TRAIN_LANGUAGE)
TRAIN_MODEL_DIR = os.path.join('models', TRAIN_LANGUAGE)

## Load the training data
Before we can start training the NER model, we need to load some training data for the model to learn how to classify the entities.

In [119]:
# Reads the annotated examples from the JSON file named in the settings.
# The file is opened in a `with` block so the handle is closed as soon as the
# content has been parsed, and it is decoded explicitly as UTF-8 so the Greek
# text is read the same way whatever the platform's default encoding is.
# NOTE(review): assumes the file is UTF-8 encoded, the usual encoding for JSON — confirm.
with open(TRAIN_FILEPATH, encoding='utf-8') as train_file:
    data = json.load(train_file)

size_of_data = len(data)

print('Items in training data', size_of_data)

Items in training data 601


In [120]:
# Seeds Python's random number generator before shuffling, so that the order of
# the data, and therefore the train/test split made from it, is the same on
# every Restart & Run All. Change the seed to obtain a different split.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# randomises the data (in place) and shows the first few items as a sanity check
random.shuffle(data)
pprint(data[:3])

[[' ἀγχοῦ δ’ ἱσταμένη προσέφη πόδας ὠκέα Ἶρις· (790)', {'entities': []}],
 [' τύμβῳ ἐπ’ ἀκροτάτῳ Αἰσυήταο γέροντος,', {'entities': []}],
 ['βορρᾶν) οὕτως· ἀπὸ Ἀλεξανδρείας  εἰς Λίνδον Ῥόδου στάδια ˏδφ,',
  {'entities': [[37, 43, 'LOC'],
                [37, 49, 'LOC'],
                [44, 49, 'LOC'],
                [19, 31, 'LOC']]}]]


### Split the data into training and testing sets
The data is split into training and testing sets, with a fraction `TRAIN_RATIO` of the data used for training; the default is 90%.

In [121]:
# index of the first item that belongs to the testing set
threshold = math.floor(size_of_data * TRAIN_RATIO)

# everything before the threshold is used for training, the remainder for testing
train_data, test_data = data[:threshold], data[threshold:]

## Create a new model
Create a new blank model for the language provided and add an entity recogniser to the pipeline.

In [127]:
# starts from an empty pipeline for the configured language...
nlp = spacy.blank(TRAIN_LANGUAGE)

# ...then creates an entity recogniser component and adds it to that pipeline
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

## Add the new entity labels
The new labels are added to the entity recognizer using the `add_label` method. The entity recognizer is accessible in the pipeline via `nlp.get_pipe('ner')`.

In [128]:
# registers with the recogniser every entity type that appears in the annotations;
# the whole data set is scanned here, not only the training portion
for _text, annotations in data:
    for entity in annotations['entities']:
        label = entity[2]
        ner.add_label(label)

## Train the NER Model
Loop over the training data and call `nlp.update`, which steps through the words of the input. At each word, it makes a prediction. It then consults the annotations, to see whether it was right. If it was wrong, it adjusts its weights so that the correct action will score higher next time.

In [129]:
# initialises a pipeline for training
# The value returned by `begin_training` is kept in `optimizer`.
# NOTE(review): none of the visible cells passes `optimizer` on (the training
# loop calls `nlp.update` without it) — presumably spaCy keeps its own
# reference to it; confirm.
optimizer = nlp.begin_training()



In [130]:
# trains the model: one pass over the training set per iteration
for iteration in range(TRAIN_ITERATIONS):
    print('# Iteration', iteration)

    # each pass starts with empty losses and a fresh ordering of the examples
    losses = {}
    random.shuffle(train_data)

    # batch sizes come from spaCy's `compounding` helper (4.0, 32.0, 1.001)
    batch_sizes = compounding(4.0, 32.0, 1.001)
    for batch in minibatch(train_data, size=batch_sizes):
        # turns a batch of (text, annotations) pairs into two parallel tuples
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, drop=0.5, losses=losses)

    print('Losses', losses)

# Iteration 0
Losses {'ner': 96.91940750711423}
# Iteration 1
Losses {'ner': 56.917766006483184}
# Iteration 2
Losses {'ner': 45.388349915962436}
# Iteration 3
Losses {'ner': 41.61771054131757}
# Iteration 4
Losses {'ner': 32.93759354703577}


## Test the NER Model
Test the model to make sure the new entity is recognised correctly.

For each of the items tested it displays the text annotated with the expected entities, and the text annotated with the matched entities.
It also displays the [precision and recall](https://en.wikipedia.org/wiki/Precision_and_recall) of the NER model. Precision is the number of correctly identified entities divided by the number of all the identified entities. Recall is the number of correctly identified entities divided by the number of expected entities.

In [126]:
# Evaluates the model on the testing set. For every item it renders the expected
# and the predicted entities, counts the predictions that agree with the
# annotations, and finally prints the overall precision and recall.

# running totals across the whole testing set
total_true_positives = 0
total_false_positives = 0
total_expected_entities = 0

for text, annotations in test_data:
    print('---')

    doc = nlp(text)

    # the annotated entities, as dicts for the visualiser's manual mode
    expected = {
        'title': 'Expected entities',
        'text': text,
        'ents': [{'start': ent[0], 'end': ent[1], 'label': ent[2]} for ent in annotations['entities']]
    }

    # sorts the entities by start position, otherwise the spacy visualiser duplicates
    # portions of the text in the output
    expected['ents'] = sorted(expected['ents'], key=itemgetter('start'))
    expected_len = len(expected['ents'])

    displacy.render(expected, jupyter=True, manual=True, style='ent')

    # the entities predicted by the model, in the same dict shape as the expected
    # ones so that the two lists can be compared item by item
    entities = {
        'title': 'Matched entities',
        'text': text,
        'ents': [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_} for ent in doc.ents]
    }
    entities_len = len(entities['ents'])

    # renders the doc itself when it has entities, otherwise falls back to the
    # manual dict so that the text is still displayed
    if doc.ents:
        doc.user_data['title'] = 'Matched entities'
        displacy.render(doc, jupyter=True, style='ent')
    else:
        displacy.render(entities, jupyter=True, manual=True, style='ent')

    # a prediction counts as a match only when its start, end and label all
    # equal those of an expected entity
    matches = sum(1 for ent in entities['ents'] if ent in expected['ents'])

    total_true_positives += matches
    total_false_positives += (entities_len - matches)
    total_expected_entities += expected_len

    print('Matched', matches, 'out of', expected_len)
    print()

total_predicted_entities = total_true_positives + total_false_positives

# precision is the number of correct results divided by the number of all returned results;
# reported as 0.0 when the model returned no entities at all, instead of dividing by zero
precision = total_true_positives / total_predicted_entities if total_predicted_entities else 0.0
print('% Precision', precision)

# recall is the number of correct results divided by the number of results that should have been returned;
# reported as 0.0 when the testing set has no expected entities, instead of dividing by zero
recall = total_true_positives / total_expected_entities if total_expected_entities else 0.0
print('% Recall', recall)

---


Matched 3 out of 3

---


Matched 0 out of 0

---


Matched 4 out of 5

---


Matched 0 out of 0

---


Matched 0 out of 0

---


Matched 6 out of 7

---


Matched 2 out of 2

---


Matched 4 out of 4

---


Matched 1 out of 3

---


Matched 0 out of 0

---


Matched 3 out of 4

---


Matched 1 out of 1

---


Matched 0 out of 0

---


Matched 0 out of 0

---


Matched 2 out of 2

---


Matched 0 out of 0

---


Matched 4 out of 6

---


Matched 0 out of 0

---


Matched 0 out of 0

---


Matched 3 out of 3

---


Matched 2 out of 2

---


Matched 0 out of 0

---


Matched 2 out of 2

---


Matched 0 out of 0

---


Matched 0 out of 0

---


Matched 0 out of 0

---


Matched 0 out of 0

---


Matched 1 out of 2

---


Matched 2 out of 2

---


Matched 2 out of 3

---


Matched 0 out of 0

---


Matched 1 out of 1

---


Matched 2 out of 3

---


Matched 0 out of 0

---


Matched 1 out of 3

---


Matched 1 out of 2

---


Matched 1 out of 3

---


Matched 3 out of 3

---


Matched 1 out of 2

---


Matched 0 out of 0

---


Matched 0 out of 0

---


Matched 0 out of 0

---


Matched 0 out of 0

---


Matched 3 out of 4

---


Matched 0 out of 0

---


Matched 6 out of 7

---


Matched 1 out of 1

---


Matched 0 out of 0

---


Matched 0 out of 0

---


Matched 0 out of 0

---


Matched 0 out of 0

---


Matched 0 out of 0

---


Matched 1 out of 3

---


Matched 2 out of 2

---


Matched 0 out of 0

---


Matched 0 out of 0

---


Matched 2 out of 2

---


Matched 0 out of 0

---


Matched 1 out of 4

---


Matched 0 out of 0

---


Matched 0 out of 0

% Precision 0.6868686868686869
% Recall 0.7472527472527473


## Save the model
Save the trained model to disk.

In [143]:
# Saves the trained pipeline into the directory configured in the settings cell.
# TRAIN_MODEL_DIR is used because the name referenced here before
# (TRAIN_OUTPUT_DIR) is not defined in any visible cell, so the cell failed
# with a NameError on a fresh kernel.
if TRAIN_MODEL_DIR is not None:
    output_dir = Path(TRAIN_MODEL_DIR)

    # creates the directory, and any missing parents, unless it is already there
    output_dir.mkdir(parents=True, exist_ok=True)

    # records the configured model name in the pipeline's metadata before writing it out
    nlp.meta['name'] = TRAIN_MODEL_NAME
    nlp.to_disk(output_dir)

    print('Saved model to', output_dir)

Saved model to models/el
