In [91]:
import json
import math
import random
from collections import Counter
from pprint import pprint

import spacy
from spacy.util import minibatch, compounding

In [92]:
# loads the training data: a JSON list of
# [text, {'entities': [[start, end, label], ...]}] pairs
# The file is opened in a `with` block so the handle is closed as soon as the
# JSON has been parsed, and the encoding is pinned to UTF-8 so the Arabic text
# is decoded the same way regardless of the platform's default locale.
with open(
    'training/json/0748Dhahabi.TarikhIslam.Shamela0035100-ara1.csv.json',
    encoding='utf-8',
) as train_file:
    data = json.load(train_file)
size_of_data = len(data)

# shows the number of examples and a small sample of them
pprint(size_of_data)
pprint(data[:3])

20107
[['الشأن وتلاحق المهاجرون الذين تأخروا بمكة بالنبي صلى الله عليه وسلم',
  {'entities': [[36, 40, 'LOC']]}],
 ['بن وائل السهمي والد عمرو بمكة على الكفر وكذلك أبو أحيحة',
  {'entities': [[25, 29, 'LOC']]}],
 ['وسلم حارثة وأبا رافع إلى مكة لينقلا بناته وسودة أم المؤمنين',
  {'entities': [[25, 28, 'LOC']]}]]


In [93]:
# randomises the data (in place)
# The global generator is seeded first so that the shuffle, and therefore the
# train/test split derived from it, comes out the same on every fresh
# Restart & Run All instead of changing from run to run.
random.seed(42)
random.shuffle(data)

In [94]:
# holds out the tail of the shuffled examples for testing:
# the first 90% become the training set, the remaining 10% the test set
threshold = math.floor(0.9 * size_of_data)

train_data, test_data = data[:threshold], data[threshold:]

In [96]:
# creates a blank Arabic ('ar') pipeline and attaches a fresh NER component to it
# NOTE(review): create_pipe() followed by add_pipe(component) looks like the
# spaCy v2 calling convention — confirm against the installed spaCy version
nlp = spacy.blank('ar')
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

In [97]:
# registers with the NER component every label that occurs in the annotations
# (each example is a [text, annotation] pair; each span is [start, end, label])
for _text, annotation in data:
    for span in annotation['entities']:
        label = span[2]
        ner.add_label(label)

In [98]:
# initialises the pipeline for training and keeps whatever begin_training()
# returns (presumably the optimizer, going by the variable name — confirm
# against the spaCy documentation for the installed version)
optimizer = nlp.begin_training()



In [99]:
# trains the model
# Each iteration reshuffles the training examples, feeds them to the pipeline
# in mini-batches and prints the loss accumulated over that iteration.
# NOTE(review): the output saved under this cell shows five iterations, so it
# was produced with a different iteration count than the one set here.
N_ITERATIONS = 1

for i in range(N_ITERATIONS):
    print('# Iteration', i)

    random.shuffle(train_data)
    losses = {}  # filled in by nlp.update(), reset at the start of every iteration

    # batch sizes come from compounding(start=4.0, stop=32.0, compound=1.001)
    batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
            texts,  # batch of texts
            annotations,  # batch of annotations
            drop=0.5,  # dropout - make it harder to memorise data
            sgd=optimizer,  # explicitly use the optimizer from nlp.begin_training()
            losses=losses,
        )

    print('Losses', losses)

Iteration # 0
Losses {'ner': 144.85930563754022}
Iteration # 1
Losses {'ner': 5.258088229333444}
Iteration # 2
Losses {'ner': 3.588416440092322}
Iteration # 3
Losses {'ner': 1.071326352099852}
Iteration # 4
Losses {'ner': 2.293773869451867}


In [100]:
# tests the trained model on the held-out examples
# For every test text the predicted entities are compared with the annotated
# ones as (surface text, label) pairs, and the counts are accumulated into
# corpus-level precision and recall.
c_true_positives = 0
c_false_positives = 0
c_expected = 0

for text, annotation in test_data:
    doc = nlp(text)

    # annotated entities, sliced out of the text via their character offsets
    expected = [
        (text[ent[0]:ent[1]], ent[2]) for ent in annotation['entities']
    ]
    expected_len = len(expected)

    # entities predicted by the model, in the same (surface text, label) form
    entities = [
        (ent.text, ent.label_) for ent in doc.ents
    ]
    entities_len = len(entities)

    print('#', text)
    print('Expected Entities', expected)
    print('Entities', entities)

    # Multiset intersection: an expected entity can be matched by at most one
    # prediction, so repeated identical predictions are not all counted as
    # correct and `matches` can never exceed `expected_len`.
    matches = sum((Counter(entities) & Counter(expected)).values())

    print('Matched', matches, 'out of', expected_len)

    c_true_positives += matches
    c_false_positives += (entities_len - matches)
    c_expected += expected_len

    print()

c_returned = c_true_positives + c_false_positives

# precision is the number of correct results divided by the number of all returned results
# (reported as 0 when the model returned nothing, instead of dividing by zero)
precision = c_true_positives / c_returned if c_returned else 0.0
# the label announces a percentage, so the ratio is scaled to 0-100
print('% Precision', 100 * precision)

# recall is the number of correct results divided by the number of results that should have been returned
# (reported as 0 when there was nothing to find, instead of dividing by zero)
recall = c_true_positives / c_expected if c_expected else 0.0
print('% Recall', 100 * recall)

Text النهرفضلي البصري الوفاة ه نزيل بغداد شيخ صالح قرأ طرفا من
Expected Entities [('بغداد', 'LOC')]
Entities [('بغداد', 'LOC')]
1  out of  1

Text التميمي الأديب الوفاة ه توفي بهراة في رجب  صاعد بن
Expected Entities [('بهراة', 'LOC')]
Entities [('بهراة', 'LOC')]
1  out of  1

Text بن عتاب وكان أسند من بالأندلس في زمانه توفي في عاشر
Expected Entities [('بالأندلس', 'LOC')]
Entities [('بالأندلس', 'LOC')]
1  out of  1

Text وهؤلاء وأبا علي التستري وجماعة بالبصرة ثم سمع ببغداد ما لا
Expected Entities [('بالبصرة', 'LOC')]
Entities [('بالبصرة', 'LOC')]
1  out of  1

Text الطرسوسي الزاهد الوفاة ه نزيل دمشق روى عن الفضيل بن عياض
Expected Entities [('دمشق', 'LOC')]
Entities [('دمشق', 'LOC')]
1  out of  1

Text وأعجب به وأمره وبعثه إلى دمشق في سنة ست وأربع مائة
Expected Entities [('دمشق', 'LOC')]
Entities [('دمشق', 'LOC')]
1  out of  1

Text إبراهيم الجزري المعروف بالجبيلي اشتراه بدمشق ورباه ثم باعه بألف دينار
Expected Entities [('بدمشق', 'LOC')]
Entities [('بدمشق', 'LOC')]
1  out of  1

Text محم