In [2]:
import json
import math
from pprint import pprint
import random
import spacy

from spacy.util import minibatch, compounding

In [3]:
# Loads the annotated training data.
# Each record is a [text, {'entities': [[start, end, label], ...]}] pair.
#
# The file is read inside a context manager so the handle is closed as soon as
# parsing finishes, and with an explicit encoding: the corpus is polytonic
# Greek, and the platform default encoding is not guaranteed to be UTF-8.
#
# NOTE(review): the first record appears to begin with a byte-order mark
# ('\ufeff') that its entity offsets seem to count as a character, so it is
# deliberately left in place here -- confirm before stripping it.
with open('training/json/agathemerus-with-context.json', encoding='utf-8') as train_file:
    data = json.load(train_file)
size_of_data = len(data)

# quick sanity check: number of records and a small sample of them
pprint(size_of_data)
pprint(data[:3])

207
[['\ufeffἈγαθημέρου τοῦ Ὄρθωνος', {'entities': [[0, 11, 'PERSON']]}],
 ['γεωγραφίας ὑποτύπωσις', {'entities': []}],
 ['Ἀναξίμανδρος ὁ Μιλήσιος ἀκουστὴς Θάλεω πρῶτος ἀπετόλμησε',
  {'entities': [[33, 38, 'PERSON'], [15, 23, 'LOC'], [0, 12, 'PERSON']]}]]


In [4]:
# Randomises the record order so the train/test split is not biased by the
# order of the source text.
#
# The global generator is seeded first so that a fresh "Restart & Run All"
# reproduces the same order (and therefore the same split). The shuffle is in
# place: re-running only this cell permutes the already shuffled list again,
# so reload the data first if the exact order matters.
SEED = 42
random.seed(SEED)
random.shuffle(data)

In [5]:
# Holds out the tail of the record list for evaluation: the first 90% of the
# records (rounded down) become the training set, the remainder the test set.
threshold = math.floor(0.9 * size_of_data)

train_data, test_data = data[:threshold], data[threshold:]

In [6]:
# Creates a blank (untrained) pipeline for modern Greek ('el'); per the
# original author, the Ancient Greek language code 'grc' is not recognised,
# so 'el' is used as the closest available language.
nlp = spacy.blank('el')
# Builds a named-entity-recognition component and appends it to the pipeline.
# `ner` is kept as a separate name because the entity labels are registered
# on it directly.
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

In [7]:
# Registers every entity type that occurs in the annotations with the NER
# component. Labels are taken from the whole data set (not only the training
# part) and are passed on in the order in which they are encountered.
annotated_labels = (
    span[2]                             # third field of [start, end, label]
    for record in data
    for span in record[1]['entities']
)
for label in annotated_labels:
    ner.add_label(label)

In [8]:
# Initialises the pipeline for training. The object returned by
# begin_training() is kept as `optimizer` (in the spaCy v2 API this is the
# optimizer used for weight updates).
optimizer = nlp.begin_training()



In [15]:
# Trains the NER model.
#
# Each iteration is one full pass over the training records in a fresh random
# order. The summed NER loss of the pass is printed so progress can be
# followed in the cell output.
N_ITERATIONS = 50  # number of passes over the training set
DROPOUT = 0.5      # dropout rate - makes it harder to memorise the data

for i in range(N_ITERATIONS):
    print('# Iteration', i)

    # in-place reshuffle so batches differ from one pass to the next
    random.shuffle(train_data)
    losses = {}  # filled in by nlp.update, reset for every pass

    # batch size starts at 4 and compounds by a factor of 1.001 per batch,
    # capped at 32
    batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
            texts,  # batch of texts
            annotations,  # batch of annotations
            # use the optimizer created by nlp.begin_training() explicitly
            # instead of relying on spaCy's internal fallback
            sgd=optimizer,
            drop=DROPOUT,
            losses=losses,
        )

    print('Losses', losses)

# Iteration 0
Losses {'ner': 1.4050549834951804}
# Iteration 1
Losses {'ner': 1.3783317436764868}
# Iteration 2
Losses {'ner': 0.6665153072894389}
# Iteration 3
Losses {'ner': 1.2605655137659277}
# Iteration 4
Losses {'ner': 0.8145239995546611}
# Iteration 5
Losses {'ner': 1.1997459446477223}
# Iteration 6
Losses {'ner': 1.215838536748899}
# Iteration 7
Losses {'ner': 1.7222738736500105}
# Iteration 8
Losses {'ner': 0.7559657691771285}
# Iteration 9
Losses {'ner': 1.353873784060288}
# Iteration 10
Losses {'ner': 1.8900729648110248}
# Iteration 11
Losses {'ner': 2.3023031160799925}
# Iteration 12
Losses {'ner': 1.7571170076376224}
# Iteration 13
Losses {'ner': 1.3474189799704797}
# Iteration 14
Losses {'ner': 1.6262811113198323}
# Iteration 15
Losses {'ner': 1.2040129964432376}
# Iteration 16
Losses {'ner': 0.738685608679565}
# Iteration 17
Losses {'ner': 0.4759150063357998}
# Iteration 18
Losses {'ner': 1.8740923620967769}
# Iteration 19
Losses {'ner': 1.9611413059622622}
# Iteration 2

In [18]:
# Tests the trained model on the held-out records and reports precision and
# recall.
#
# A predicted entity counts as correct when its (surface text, label) pair
# occurs among the expected pairs of the same record; character offsets are
# not compared.
# NOTE(review): the gold annotations may list the same pair more than once or
# contain nested spans; every listed span counts towards the recall
# denominator.
c_true_positives = 0
c_false_positives = 0
c_expected = 0

for text, gold in test_data:
    doc = nlp(text)

    # gold standard: slice the annotated spans out of the text
    expected = [
        (text[ent[0]:ent[1]], ent[2]) for ent in gold['entities']
    ]
    expected_len = len(expected)

    # what the model actually found
    entities = [
        (ent.text, ent.label_) for ent in doc.ents
    ]

    print('#', text)
    print('Expected Entities', expected)
    print('Entities', entities)

    matches = sum(1 for ent in entities if ent in expected)

    print('Matched', matches, 'out of', expected_len)

    c_true_positives += matches
    c_false_positives += (len(entities) - matches)
    c_expected += expected_len

    print()

# Both scores are fractions between 0 and 1. When a denominator is zero (the
# model returned nothing, or the test set holds no gold entities) the score is
# reported as 0.0 instead of raising ZeroDivisionError.
c_returned = c_true_positives + c_false_positives

# precision is the number of correct results divided by the number of all returned results
precision = c_true_positives / c_returned if c_returned else 0.0
print('% Precision', precision)

# recall is the number of correct results divided by the number of results that should have been returned
recall = c_true_positives / c_expected if c_expected else 0.0
print('% Recall', recall)

# Χίου στάδια ν, ἐπὶ Μελανέα ἄκρον Λέσβου στάδια υν,  ἐπὶ Σίγριον
Expected Entities [('Λέσβου', 'LOC'), ('Σίγριον', 'LOC'), ('Χίου', 'LOC'), ('Μελανέα', 'LOC'), ('Σίγριον', 'LOC'), ('Μελανέα ἄκρον Λέσβου', 'LOC')]
Entities [('Χίου', 'LOC'), ('Μελανέα', 'LOC'), ('Λέσβου', 'LOC'), ('Σίγριον', 'LOC')]
Matched 4 out of 6

# καλοῦσι Κορινθιακὸν κόλπον ἤτοι Ἀλκυονίδα θάλασσαν. τὸ δὲ
Expected Entities [('Κορινθιακὸν κόλπον ἤτοι Ἀλκυονίδα θάλασσαν', 'LOC'), ('Ἀλκυονίδα', 'LOC'), ('Κορινθιακὸν κόλπον', 'LOC'), ('Ἀλκυονίδα θάλασσαν', 'LOC')]
Entities [('Κορινθιακὸν', 'LOC'), ('Ἀλκυονίδα θάλασσαν', 'LOC')]
Matched 1 out of 4

# ἐκβολῶν ἕως Ἡρακλείων στηλῶν σταδίων μυριάδων ἓξ καὶ ˏθψθ,
Expected Entities [('Ἡρακλείων στηλῶν', 'LOC'), ('στηλῶν', 'LOC'), ('Ἡρακλείων', 'PERSON')]
Entities [('Ἡρακλείων στηλῶν', 'LOC')]
Matched 1 out of 3

# πολύπειρος ἀνὴρ συνεῖδεν ὅτι προμήκης ἐστὶν ἡ γῆ ἡμιόλιον τὸ μῆκος
Expected Entities []
Entities []
Matched 0 out of 0

# σταδίων δισμυρίων, τοῦ δὲ στόματος ἔχει τ