In [91]:
import json
from pprint import pprint
import random
import spacy

In [92]:
# loads the training data
# the context manager closes the file as soon as it has been parsed, and the
# explicit encoding keeps the polytonic Greek intact on platforms whose default
# text encoding is not UTF-8
with open('training/json/agathemerus.json', encoding='utf-8') as train_file:
    data = json.load(train_file)

In [93]:
# randomises the data
# the fixed seed makes the shuffle, and therefore the train/test split sliced
# from it, repeatable on a fresh kernel; it seeds the module-level generator, so
# later random.shuffle calls become repeatable as well
# note: the shuffle is in place, so re-running this cell without reloading
# `data` produces a different order
SEED = 42
random.seed(SEED)
random.shuffle(data)
pprint(data[:3])

[{'id': 0,
  'paragraphs': [{'sentences': [{'tokens': [{'ner': 'U-LOC',
                                             'orth': 'Εὐρώπης',
                                             'tag': '-'}]}]}]},
 {'id': 0,
  'paragraphs': [{'sentences': [{'tokens': [{'ner': 'U-LOC',
                                             'orth': 'Ἰόνιον',
                                             'tag': '-'}]}]}]},
 {'id': 0,
  'paragraphs': [{'sentences': [{'tokens': [{'ner': 'U-LOC',
                                             'orth': 'Εὔβοια',
                                             'tag': '-'}]}]}]}]


In [94]:
# splits the data into training and testing sets
# the first TRAIN_SIZE records of the shuffled list train the model; whatever is
# left is held out for testing (nothing, if there are TRAIN_SIZE records or fewer)
TRAIN_SIZE = 1700
train_data = data[:TRAIN_SIZE]
test_data = data[TRAIN_SIZE:]

In [95]:
# creates a blank Greek model; the modern Greek code 'el' is used because the
# Ancient Greek code 'grc' is not recognised
nlp = spacy.blank('el')
# the NER component is registered first, then fetched back by name so that
# `ner` points at the component living inside the pipeline
nlp.add_pipe(nlp.create_pipe('ner'))
ner = nlp.get_pipe('ner')

In [96]:
# adds the entity types we want to identify
# NOTE(review): the 'U-' prefix looks like a BILUO unit tag carried over from the
# training JSON 'ner' field - confirm whether plain 'LOC' / 'PER' were intended
for entity_type in ('U-LOC', 'U-PER'):
    ner.add_label(entity_type)

In [97]:
# initialises a pipeline for training
# the returned optimizer is what the training loop passes to nlp.update as `sgd`
optimizer = nlp.begin_training()



In [88]:
# trains the model: three passes over the training set, one record per update
for epoch in range(3):
    print('Iteration #', epoch)

    # a fresh order and a fresh loss accumulator for every pass, so the figure
    # printed below covers this pass only
    random.shuffle(train_data)
    losses = {}

    for item in train_data:
        # only the first token of the first sentence of the first paragraph is read
        token = item['paragraphs'][0]['sentences'][0]['tokens'][0]
        text, label = token['orth'], token['ner']

        # 'O' means no entity; any other tag marks the whole string as one span
        spans = [] if label == 'O' else [(0, len(text), label)]

        nlp.update([text], [{'entities': spans}], sgd=optimizer, drop=0.35, losses=losses)

    print('Losses', losses)

Iteration # 0
Losses {'ner': 705.9919161181741}
Iteration # 1
Losses {'ner': 623.3987554933689}
Iteration # 2
Losses {'ner': 424.8161793689791}


In [90]:
# tests the trained model
# every predicted entity is printed as before; in addition, expected entities the
# model failed to find are reported and a correct/total score is kept, because
# printing predictions alone hides the misses
n_total = 0
n_correct = 0

for item in test_data:
    # only the first token of the first sentence of the first paragraph is read
    token = item['paragraphs'][0]['sentences'][0]['tokens'][0]
    text = token['orth']
    expected = token['ner']

    doc = nlp(text)
    for ent in doc.ents:
        print('Entity in', text, ':', ent.label_, '| expected: ', expected)

    # 'O' means no entity is expected; otherwise exactly the expected label
    predicted = {ent.label_ for ent in doc.ents}
    wanted = set() if expected == 'O' else {expected}

    if wanted and not predicted:
        print('No entity in', text, '| expected: ', expected)

    n_total += 1
    if predicted == wanted:
        n_correct += 1

# an empty test set would otherwise divide by zero
if n_total:
    print('Correct:', n_correct, '/', n_total, '=', round(n_correct / n_total, 3))
else:
    print('Correct: 0 / 0 (test set is empty)')

Entity in Ῥόδου : U-LOC | expected:  U-LOC
Entity in Κόρκυραν : U-LOC | expected:  U-LOC
Entity in νεώτεροι : U-LOC | expected:  O
Entity in καλοῦσί : U-LOC | expected:  O
Entity in ὅσον : U-LOC | expected:  O
Entity in Κιλικίας : U-LOC | expected:  U-LOC
Entity in ἕως : U-LOC | expected:  O
Entity in κόλπῳ : U-LOC | expected:  U-LOC
Entity in τῆς : U-LOC | expected:  O
Entity in οἰκεῖν : U-LOC | expected:  O
Entity in Αἰθιοπίαν, : U-LOC | expected:  U-LOC
Entity in μεγάλαι : U-LOC | expected:  O
Entity in πνέουσιν : U-LOC | expected:  O
Entity in Γάγγου : U-LOC | expected:  U-LOC
Entity in Πελώρου : U-LOC | expected:  U-LOC
Entity in τῆς : U-LOC | expected:  O
Entity in ὄρη : U-LOC | expected:  U-LOC
Entity in Ταύρου : U-LOC | expected:  U-LOC
Entity in θάλασσαν : U-LOC | expected:  U-LOC
Entity in Μαιῶτιν : U-LOC | expected:  O
Entity in παρὰ : U-LOC | expected:  O
Entity in ποταμοῦ : U-LOC | expected:  U-LOC
Entity in Κάραμβιν : U-LOC | expected:  U-LOC
Entity in Κορυναῖον : U-LOC |