In [91]:
import json
from pprint import pprint
import random
import spacy

In [92]:
# loads the training data; the context manager closes the file handle as soon as
# parsing finishes, and the explicit encoding stops the Greek text from being decoded
# with a platform-dependent default (assumes the file is UTF-8, as JSON normally is)
with open('training/json/agathemerus.json', encoding='utf-8') as train_file:
    data = json.load(train_file)

In [93]:
# randomises the data; seeding first makes the shuffle order -- and therefore the
# train/test split sliced from it further down -- the same on every fresh run
random.seed(42)
random.shuffle(data)
pprint(data[:3])

[{'id': 0,
  'paragraphs': [{'sentences': [{'tokens': [{'ner': 'U-LOC',
                                             'orth': 'Εὐρώπης',
                                             'tag': '-'}]}]}]},
 {'id': 0,
  'paragraphs': [{'sentences': [{'tokens': [{'ner': 'U-LOC',
                                             'orth': 'Ἰόνιον',
                                             'tag': '-'}]}]}]},
 {'id': 0,
  'paragraphs': [{'sentences': [{'tokens': [{'ner': 'U-LOC',
                                             'orth': 'Εὔβοια',
                                             'tag': '-'}]}]}]}]


In [94]:
# splits the (already shuffled) data: the first 1700 records are used for training,
# whatever follows them is held out for testing
split_at = 1700
train_data, test_data = data[:split_at], data[split_at:]

In [95]:
# creates a blank pipeline for the language code 'el' (Greek); per the original
# author's note the Ancient Greek code 'grc' is not recognised, so 'el' is used instead
# NOTE(review): create_pipe() followed by add_pipe(component) looks like the spaCy v2
# calling convention -- confirm which spaCy version this notebook is pinned to
nlp = spacy.blank('el')
# builds a named-entity-recogniser component and appends it to the pipeline
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

In [96]:
# registers the entity labels the recogniser should learn; they are the same tag
# strings that appear in the 'ner' field of the training JSON
# NOTE(review): the 'U-' prefix looks like a BILUO position marker rather than part
# of the entity type -- confirm these are meant to be used as labels verbatim
for label in ('U-LOC', 'U-PER'):
    ner.add_label(label)

In [97]:
# initialises the pipeline for training and keeps the returned optimizer, which the
# training loop passes to nlp.update() as its `sgd` argument
optimizer = nlp.begin_training()



In [99]:
# trains the model: three passes over the training set, reshuffled before each pass
for epoch in range(3):
    print('Iteration #', epoch)

    random.shuffle(train_data)
    losses = {}  # fresh dict for every pass; printed once the pass is over

    for record in train_data:
        # only the first token of the first sentence of the first paragraph is read
        token = record['paragraphs'][0]['sentences'][0]['tokens'][0]
        texts = token['orth']
        entity = token['ner']

        # 'O' means "no entity"; any other tag is applied to the whole string
        spans = [] if entity == 'O' else [(0, len(texts), entity)]
        annotations = {'entities': spans}

        # one example per update call
        nlp.update([texts], [annotations], sgd=optimizer, drop=0.35, losses=losses)

    print('Losses', losses)

Iteration # 0
Losses {'ner': 763.779519675833}
Iteration # 1
Losses {'ner': 600.2778975733311}
Iteration # 2
Losses {'ner': 424.4591039082176}


In [100]:
# tests the trained model: prints every predicted entity next to the expected tag,
# then a summary -- items for which the model predicts nothing print no line in the
# loop, so without the tallies missed entities and correct 'O's would go unnoticed
total = 0
correct = 0
missed = 0  # expected tag is an entity but the model predicted none

for item in test_data:
    # only the first token of the first sentence of the first paragraph is read
    tokens = item['paragraphs'][0]['sentences'][0]['tokens'][0]
    text = tokens['orth']
    expected = tokens['ner']

    doc = nlp(text)
    for ent in doc.ents:
        print('Entity in', text, ':', ent.label_, '| expected: ', expected)

    predicted = [ent.label_ for ent in doc.ents]
    total += 1
    if not predicted:
        # no prediction is right only when the expected tag is 'O'
        if expected == 'O':
            correct += 1
        else:
            missed += 1
    elif predicted == [expected]:
        correct += 1

# guard the ratio so an empty test set does not raise ZeroDivisionError
if total:
    print('Correct:', correct, 'of', total,
          '| accuracy:', round(correct / total, 3),
          '| entities missed entirely:', missed)
else:
    print('No test items to evaluate')

Entity in Σκύλλαιον : U-LOC | expected:  U-LOC
Entity in Ἀλεξανδρείας : U-LOC | expected:  U-LOC
Entity in Ἰωνίας : U-LOC | expected:  U-LOC
Entity in προμήκης : U-LOC | expected:  O
Entity in Ἔφεσον : U-LOC | expected:  U-LOC
Entity in Ἀδρίαν : U-LOC | expected:  O
Entity in τὸν : U-LOC | expected:  O
Entity in τοῦ : U-LOC | expected:  O
Entity in Πόντον : U-LOC | expected:  U-LOC
Entity in Σαρδοῦς : U-LOC | expected:  U-LOC
Entity in τὴν : U-LOC | expected:  O
Entity in Μενίππου : U-LOC | expected:  U-PER
Entity in τῇ : U-LOC | expected:  O
Entity in ἄκρου : U-LOC | expected:  U-LOC
Entity in τῶν : U-LOC | expected:  O
Entity in Πόντου : U-LOC | expected:  U-LOC
Entity in Ἴσση : U-LOC | expected:  O
Entity in Μῆνιγξ : U-LOC | expected:  U-LOC
Entity in Ποπουλώνιον : U-LOC | expected:  U-LOC
Entity in Αἰλανίτου : U-LOC | expected:  U-LOC
Entity in τὸ : U-LOC | expected:  O
Entity in κόλπος : U-LOC | expected:  U-LOC
Entity in Ἰκάριον : U-LOC | expected:  U-LOC
Entity in Γάδειρα : U-LO