In [72]:
import json
import math
from pprint import pprint
import random
import spacy

from spacy.util import minibatch, compounding

In [66]:
# Loads the annotated sentences from JSON.
# The file is opened in a `with` block so the handle is closed as soon as the
# data has been parsed, and the encoding is stated explicitly so the Greek
# text is decoded the same way on every platform.
# NOTE(review): the first record's text starts with a U+FEFF (BOM) character
# (visible in the output below) and its entity offsets appear to count it --
# confirm before stripping it from the data.
with open('training/json/agathemerus-with-context.json', encoding='utf-8') as train_file:
    data = json.load(train_file)
size_of_data = len(data)

# quick sanity check: number of records and a peek at the first three
pprint(size_of_data)
pprint(data[:3])

207
[['\ufeffἈγαθημέρου τοῦ Ὄρθωνος', {'entities': [[0, 11, 'PERSON']]}],
 ['γεωγραφίας ὑποτύπωσις', {'entities': []}],
 ['Ἀναξίμανδρος ὁ Μιλήσιος ἀκουστὴς Θάλεω πρῶτος ἀπετόλμησε',
  {'entities': [[33, 38, 'PERSON'], [15, 23, 'LOC'], [0, 12, 'PERSON']]}]]


In [67]:
# Randomises the order of the records (in place) before the train/test split.
# The generator is seeded first so that a fresh "Restart & Run All" always
# produces the same ordering, and therefore the same split and comparable
# evaluation numbers between runs.
random.seed(42)
random.shuffle(data)

In [68]:
# Partitions the (already shuffled) records: the leading 90% become the
# training set and whatever is left over is held back for testing.
TRAIN_FRACTION = .9
threshold = math.floor(size_of_data * TRAIN_FRACTION)

train_data, test_data = data[:threshold], data[threshold:]

In [69]:
# Creates a blank pipeline for the language code 'el' (Greek); per the original
# author, spaCy doesn't recognise the Ancient Greek code 'grc', so 'el' is used
# as a stand-in. A named-entity-recognition ('ner') component is then created
# and added to that pipeline; `ner` is kept so labels can be registered on it.
# NOTE(review): create_pipe(...) followed by add_pipe(component) looks like the
# spaCy v2-style API -- confirm against the installed spaCy version.
nlp = spacy.blank('el')
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

In [70]:
# Registers with the NER component every label that occurs in the annotations.
# Each record is [text, {'entities': [[start, end, label], ...]}], so the
# label is the third element of each entity triple.
entity_labels = (
    span[2]
    for record in data
    for span in record[1]['entities']
)
for label in entity_labels:
    ner.add_label(label)

In [71]:
# Initialises the pipeline for training and keeps the object returned by
# begin_training() in `optimizer`.
# NOTE(review): `optimizer` is not passed to nlp.update() in the training cell
# visible in this notebook -- confirm whether it is needed elsewhere.
optimizer = nlp.begin_training()



In [83]:
# Trains the model: ten passes over the training set, reporting the
# accumulated loss after each pass.
for epoch in range(10):
    print('Iteration #', epoch)

    # present the examples in a new order on every pass
    random.shuffle(train_data)
    losses = {}

    # batch-size schedule handed to minibatch(): arguments are (4.0, 32.0, 1.001)
    batch_sizes = compounding(4.0, 32.0, 1.001)
    for batch in minibatch(train_data, size=batch_sizes):
        # each batch item is a (text, annotations) pair; split into two sequences
        texts, annotations = zip(*batch)
        # drop=0.5 is the dropout rate - makes it harder to memorise the data;
        # `losses` is updated in place by the call
        nlp.update(texts, annotations, drop=0.5, losses=losses)

    print('Losses', losses)

Iteration # 0
Losses {'ner': 36.76341152791397}
Iteration # 1
Losses {'ner': 31.145264529279427}
Iteration # 2
Losses {'ner': 27.73465433365283}
Iteration # 3
Losses {'ner': 26.40244953376762}
Iteration # 4
Losses {'ner': 23.300564097653822}
Iteration # 5
Losses {'ner': 21.538866541027687}
Iteration # 6
Losses {'ner': 19.943257923933714}
Iteration # 7
Losses {'ner': 19.792641800868452}
Iteration # 8
Losses {'ner': 16.62473737404465}
Iteration # 9
Losses {'ner': 15.372801822989823}


In [88]:
# Tests the trained model on the held-out records.
#
# For every test sentence the expected (gold) entities and the entities found
# by the model are printed, both as (surface text, label) pairs, followed by
# how many predictions agree with the gold annotation. The final figure is
# matched gold entities / total gold entities, i.e. it measures how many of
# the expected entities were found (spurious predictions are not penalised).
c_matches = 0
c_expected = 0

for text, annotation in test_data:
    doc = nlp(text)
    # gold entities: slice the surface text out of the sentence by offsets
    expected = [
        (text[ent[0]:ent[1]], ent[2]) for ent in annotation['entities']
    ]
    # model output, kept under its own name instead of overwriting the
    # loop variable that holds the gold annotation
    predicted = [
        (ent.text, ent.label_) for ent in doc.ents
    ]

    print('Text', text)
    print('Expected Entities', expected)
    print('Entities', predicted)

    # Every gold entity may be matched at most once: without this, a
    # prediction repeated in the sentence would be counted several times
    # against a single gold entity and could push the score above 1.
    unmatched = list(expected)
    matches = 0
    for ent in predicted:
        if ent in unmatched:
            unmatched.remove(ent)
            matches += 1

    print(matches, ' out of ', len(expected))

    c_matches += matches
    c_expected += len(expected)

    print()

# guard against an empty test set / a test set without any annotated entity,
# which would otherwise raise ZeroDivisionError
if c_expected:
    print('Accuracy', c_matches/c_expected)
else:
    print('Accuracy', 'n/a (no expected entities in the test data)')

Text ἔστι δὲ ἡ μεγάλη Σύρτις σταδίων ˏε, ἡ δὲ μικρὰ σταδίων ˏαχ. τὸ δὲ
Expected Entities [('ἡ μεγάλη Σύρτις', 'LOC'), ('ἡ δὲ μικρὰ', 'LOC')]
Entities [('Σύρτις', 'LOC')]
0  out of  2

Text κίρκιον ὑπὸ τῶν περιοίκων ὀνομαζόμενον. ἔθνη δὲ οἰκεῖν τὰ πέρατα
Expected Entities [('ἔθνη', 'LOC')]
Entities [('τῶν περιοίκων ὀνομαζόμενον. ἔθνη', 'LOC')]
0  out of  1

Text πάλιν δὲ ἀπ’ ἀρχῆς Εὐρώπης καὶ Λιβύης Ἰβηρικὸν τὸ ἀπὸ Στηλῶν
Expected Entities [('Ἰβηρικὸν', 'LOC'), ('Ἰβηρικὸν τὸ ἀπὸ Στηλῶν', 'LOC'), ('Στηλῶν', 'LOC'), ('Εὐρώπης', 'LOC'), ('Λιβύης', 'LOC')]
Entities [('Εὐρώπης', 'LOC'), ('Λιβύης', 'LOC'), ('Ἰβηρικὸν', 'LOC'), ('Στηλῶν', 'LOC')]
4  out of  5

Text πελάγει νῆσοι Πιτυοῦσσαι ἡ μείζων καὶ οἰκουμένη σταδίων τ μῆκος,
Expected Entities [('ἡ μείζων', 'LOC'), ('Πιτυοῦσσαι', 'LOC'), ('νῆσοι Πιτυοῦσσαι', 'LOC')]
Entities [('Πιτυοῦσσαι', 'LOC'), ('οἰκουμένη', 'LOC')]
1  out of  3

Text ˏϛωκ οὕτως· ἀπὸ Μυριάνδρου ἐπὶ Κλεῖδας Κύπρου σταδίων  χιλίων
Expected Entities [('Κλεῖδας Κύπρου', 'LO