In [294]:
import random
from pathlib import Path

import pandas as pd
import spacy
from spacy import *
from spacy import displacy  # explicit: displacy is used below, do not rely on the wildcard import
from spacy.gold import GoldParse
from toolz import partition_all

In [295]:
### Load spaCy's 'en' model; `nlp` is the pipeline object used by the
### cells that follow (label registration, training, saving).
nlp = spacy.load('en')

In [296]:
### Read the unprocessed articles from train_data.json, call
### nlp.make_doc() on each of the first six 'content' texts,
### and keep the resulting Doc objects in a list called docs.
df = pd.read_json('train_data.json')
docs = list(map(nlp.make_doc, df['content'][:6]))

In [297]:
### Create a list of lists of tuples called tuned_ents.
### Each tuple represents one fine-tuned entity for a given doc;
### each inner list holds all the fine-tuned entities of one doc,
### in the same order as docs (tuned_ents[i] is later paired with docs[i]).
### The tuples are passed to GoldParse(entities=...) -- presumably
### (start_char, end_char, label); confirm against the spaCy docs.
### NOTE(review): the original comment said the list is then stored as
### a JSON file, but nothing in this cell writes it to disk.
tuned_ents = []
# Entities for docs[0]
tuned_ents.append([(0, 7, 'GPE'), (9, 20, 'GPE'), (66, 78, 'LOC_TYPE'), (82, 86, 'GPE'), (90, 97, 'GPE'),
                   (98, 106, 'LOC_TYPE'), (123, 134, 'GPE'), (152, 169, 'STATE'), (205, 207, 'CARDINAL'),
                   (208, 213, 'CASE'), (217, 223, 'DISEASE'), (227, 249, 'DATE'), (251, 257, 'DISEASE'),
                   (384, 388, 'GPE'), (415, 431, 'DATE'), (437, 443, 'CASE'), (447, 452, 'CASE'), (466, 475, 'DATE'),
                   (480, 501, 'CHANGE'), (518, 525, 'CASE'), (538, 552, 'DATE'), (556, 570, 'DATE'),
                   (712, 739, 'STATE'), (913, 921, 'CARDINAL'), (929, 931, 'CARDINAL'), (932, 941, 'LOC_TYPE'),
                   (943, 951, 'LOC_TYPE'), (960, 964, 'LOC_TYPE'), (1154, 1159, 'DATE'), (1165, 1185, 'ORG'),
                   (1187, 1190, 'ORG'), (1201, 1206, 'PERCENT'), (1207, 1215, 'CHANGE'), (1219, 1225, 'DISEASE'),
                   (1226, 1231, 'CASE'), (1239, 1246, 'LOC_TYPE'), (1255, 1261, 'CARDINAL'), (1262, 1267, 'CASE'),
                   (1271, 1296, 'DATE'), (1302, 1308, 'CASE'), (1317, 1320, 'CARDINAL'), (1324, 1339, 'GPE'),
                   (1346, 1356, 'GPE'), (1378, 1383, 'CARDINAL'), (1392, 1428, 'GPE'), (1430, 1434, 'GPE'),
                   (1456, 1459, 'CARDINAL'), (1677, 1683, 'CASE'), (1687, 1693, 'DISEASE'), (1694, 1699, 'CASE'),
                   (1711, 1740, 'DATE')
                  ])
# Entities for docs[1]
tuned_ents.append([(0, 6, 'GPE'), (8, 19, 'GPE'), (26, 32, 'CASE'), (36, 42, 'DISEASE'), (43, 48, 'CASE'),
                   (68, 83, 'LOC_TYPE'), (94, 100, 'CASE'), (104, 119, 'CARDINAL'), (131, 135, 'CASE'),
                   (139, 150, 'CARDINAL'), (179, 195, 'DATE'), (201, 207, 'CASE'), (237, 259, 'DISEASE'),
                   (263, 269, 'GPE'), (270, 279, 'DATE'), (291, 294, 'PERCENT'), (295, 301, 'CHANGE'),
                   (307, 332, 'DATE'), (407, 413, 'DISEASE'), (414, 424, 'CASE'), (428, 450, 'DATE'),
                   (464, 469, 'CHANGE'), (483, 489, 'CASE'), (494, 511, 'DATE'), (524, 532, 'STATE'),
                   (536, 548, 'LOC_TYPE'), (748, 756, 'STATE'), (859, 865, 'DISEASE'), (1123, 1134, 'CARDINAL'),
                   (1618, 1624, 'DISEASE'), (1661, 1672, 'GPE'), (1684, 1690, 'CASE'), (1694, 1703, 'CASE'),
                   (1704, 1713, 'DATE'), (1717, 1731, 'CHANGE'), (1732, 1742, 'LOC_TYPE'), (1756, 1763, 'CHANGE'),
                   (1775, 1780, 'LOC_TYPE'), (1789, 1795, 'GPE')
                  ])
# Entities for docs[2]
tuned_ents.append([(0, 10, 'GPE'), (12, 23, 'GPE'), (37, 42, 'CARDINAL'), (43, 49, 'DISEASE'), (50, 55, 'CASE'),
                   (80, 115, 'DATE'), (120, 128, 'CHANGE'), (132, 146, 'PERCENT'), (152, 163, 'DATE'),
                   (174, 178, 'DATE'), (183, 202, 'ORG'), (225, 228, 'CARDINAL'), (229, 234, 'CASE'),
                   (242, 257, 'DATE'), (316, 326, 'GPE'), (329, 342, 'ORG'), (375, 381, 'CASE'), (385, 394, 'CASE'),
                   (395, 404, 'DATE'), (425, 444, 'DATE'), (446, 451, 'CARDINAL'), (452, 457, 'CASE'),
                   (470, 472, 'CARDINAL'), (473, 479, 'DISEASE'), (480, 487, 'CASE'), (493, 501, 'LOC_TYPE'),
                   (518, 525, 'DATE'), (625, 631, 'DISEASE'), (660, 674, 'DATE'), (1292, 1309, 'ORG'),
                   (1319, 1335, 'ORG'), (1482, 1498, 'ORG'), (1534, 1542, 'LOC_TYPE'), (1550, 1554, 'LOC_TYPE')
                  ])
# Entities for docs[3]
tuned_ents.append([(0, 6, 'GPE'), (8, 19, 'GPE'), (30, 35, 'GPE'), (36, 42, 'LOC_TYPE'), (70, 78, 'CHANGE'),
                   (82, 88, 'DISEASE'), (89, 94, 'CASE'), (138, 142, 'CHANGE'), (146, 150, 'PERCENT'),
                   (151, 160, 'DATE'), (166, 186, 'ORG'), (190, 199, 'GPE'), (239, 244, 'CARDINAL'),
                   (245, 251, 'DISEASE'), (252, 257, 'CASE'), (258, 288, 'DATE'), (290, 292, 'CHANGE'),
                   (298, 308, 'CARDINAL'), (309, 314, 'CASE'), (322, 347, 'DATE'), (508, 511, 'PERCENT'),
                   (519, 524, 'CARDINAL'), (525, 530, 'CASE'), (541, 546, 'GPE'), (583, 592, 'LOC_TYPE'), 
                   (597, 602, 'LOC_TYPE'), (604, 615, 'CARDINAL'), (616, 620, 'CASE'), (624, 630, 'DISEASE'),
                   (638, 642, 'LOC_TYPE'), (661, 685, 'ORG'), (757, 768, 'CHANGE'), (788, 809, 'DATE'),
                   (831, 835, 'PERCENT'), (837, 845, 'CHANGE'), (850, 863, 'STATE'), (882, 892, 'LOC_TYPE'),
                   (935, 940, 'CASE'), (948, 961, 'STATE'), (1080, 1090, 'LOC_TYPE'), (1226, 1234, 'STATE')])
# Entities for docs[4]
tuned_ents.append([(0, 7, 'GPE'), (9, 20, 'GPE'), (25, 31, 'DISEASE'), (32, 40, 'STATE'), (57, 72, 'GPE'),
                   (76, 91, 'DATE'), (131, 132, 'CARDINAL'), (133, 143, 'CASE'), (149, 150, 'CARDINAL'), 
                   (151, 161, 'CASE'), (169, 170, 'CARDINAL'), (181, 189, 'LOC_TYPE'), (300, 314, 'DATE'),
                   (316, 321, 'CASE'), (331, 337, 'CHANGE'), (343, 361, 'ORG'), (376, 379, 'CARDINAL'),
                   (380, 385, 'CASE'), (392, 399, 'DATE'), (421, 422, 'CARDINAL'), (423, 433, 'CASE'),
                   (468, 476, 'STATE'), (504, 507, 'CARDINAL'), (508, 513, 'CASE'), (523, 525, 'CARDINAL'),
                   (526, 531, 'CASE'), (550, 556, 'DISEASE'), (560, 570, 'GPE'), (811, 814, 'CARDINAL'),
                   (911, 926, 'DATE'), (936, 940, 'LOC_TYPE'), (944, 958, 'GPE'), (965, 977, 'LOC_TYPE'),
                   (981, 985, 'GPE'), (989, 996, 'GPE'), (997, 1005, 'LOC_TYPE'), (1019, 1036, 'STATE'),
                   (1070, 1072, 'CARDINAL'), (1073, 1078, 'CASE'), (1082, 1088, 'DISEASE'), (1097, 1114, 'DATE'),
                   (1119, 1144, 'DATE'), (1150, 1167, 'ORG'), (1177, 1183, 'CARDINAL'), (1184, 1189, 'CASE'),
                   (1193, 1199, 'DISEASE'), (1200, 1210, 'LOC_TYPE'), (1215, 1221, 'DATE'), (1223, 1230, 'GPE'),
                   (1384, 1390, 'DATE'), (1396, 1420, 'ORG'), (1436, 1439, 'CARDINAL'), (1440, 1445, 'CASE'),
                   (1449, 1455, 'DISEASE'), (1463, 1471, 'LOC_TYPE'), (1478, 1485, 'DATE'), (1492, 1493, 'CARDINAL'),
                   (1494, 1500, 'CASE'), (1504, 1517, 'GPE'), (1532, 1538, 'DISEASE'), (1539, 1544, 'DISEASE'),
                   (1545, 1550, 'CASE')])
# Entities for docs[5]
tuned_ents.append([(0, 6, 'GPE'), (8, 19, 'GPE'), (26, 46, 'ORG'), (48, 51, 'ORG'), (61, 68, 'DATE'),
                   (113, 119, 'CARDINAL'), (130, 136, 'DISEASE'), (137, 142, 'CASE'), (146, 157, 'LOC_TYPE'),
                   (158, 190, 'DATE'), (200, 205, 'PERCENT'), (206, 212, 'CHANGE'), (225, 248, 'DATE'),
                   (250, 256, 'CARDINAL'), (283, 286, 'CARDINAL'), (294, 298, 'CASE'), (300, 306, 'DISEASE'),
                   (454, 479, 'ORG'), (485, 496, 'GPE'), (620, 637, 'ORG'), (651, 657, 'DISEASE'),
                   (658, 663, 'CASE'), (667, 678, 'LOC_TYPE'), (689, 699, 'GPE'), (701, 706, 'PERCENT'),
                   (709, 724, 'GPE'), (726, 730, 'PERCENT'), (733, 750, 'GPE'), (753, 757, 'PERCENT'),
                   (759, 774, 'GPE'), (779, 791, 'GPE'), (793, 797, 'PERCENT'), (804, 817, 'GPE'),
                   (819, 823, 'PERCENT'), (836, 839, 'CARDINAL'), (861, 867, 'DISEASE'), (868, 873, 'CASE'),
                   (878, 888, 'LOC_TYPE'), (890, 896, 'DISEASE'), (897, 902, 'CASE'), (1102, 1117, 'GPE'),
                   (1141, 1147, 'CASE'), (1151, 1157, 'CASE'), (1158, 1167, 'DATE'), (1174, 1176, 'CARDINAL'),
                   (1186, 1192, 'CASE'), (1199, 1207, 'DATE'), (1213, 1216, 'ORG'), (1247, 1249, 'CARDINAL'),
                   (1250, 1259, 'LOC_TYPE'), (1261, 1269, 'LOC_TYPE'), (1274, 1285, 'LOC_TYPE'),
                   (1289, 1295, 'DISEASE'), (1309, 1318, 'STATE'), (1331, 1335, 'LOC_TYPE'), (1351, 1360, 'CARDINAL'),
                   (1361, 1367, 'DISEASE'), (1368, 1375, 'CASE'), (1522, 1528, 'DISEASE'), (1529, 1537, 'STATE'),
                   (1547, 1554, 'LOC_TYPE')])

In [298]:
#### Check the named entities recognised in each doc.
#displacy.render(docs[0], style='ent', jupyter=True)
#docs[0]

In [299]:
### Register every label that appears in tuned_ents with the
### model's 'ner' pipe.
ner_pipe = nlp.get_pipe('ner')
for doc_ents in tuned_ents:
    for entity in doc_ents:
        label = entity[2]
        ner_pipe.add_label(label)

In [300]:
### Create a list of tuples called crude_data.
### Each tuple is made up of a Doc from docs and a GoldParse object
### built from that Doc alone (no extra annotations are passed in).
### Iterating over docs directly avoids the hard-coded document count.
crude_data = [(doc, GoldParse(doc)) for doc in docs]

In [301]:
### Create a list of tuples called tuned_data. Each tuple's
### GoldParse object uses the fine-tuned entities found in
### the list called tuned_ents.
### docs and tuned_ents are paired positionally; zip() stops at the
### shorter of the two, so no document count is hard-coded here.
tuned_data = [
    (doc, GoldParse(doc, entities=doc_ents))
    for doc, doc_ents in zip(docs, tuned_ents)
]

In [302]:
### Train the model's 'ner' pipe using crude_data and tuned_data.
n_epoch = 20
batch_size = 10
# The training set is the same every epoch, so build it once up front.
data = crude_data + tuned_data
# Every pipe except 'ner' is disabled while training.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
    # NOTE(review): spaCy's fine-tuning examples avoid begin_training() on
    # an already-loaded model (they use resume_training() / the pipe's
    # create_optimizer() instead) -- confirm this call is intended here.
    optimizer = nlp.begin_training()
    for i in range(n_epoch):
        # Shuffle per epoch, like the other training loops in this notebook.
        random.shuffle(data)
        losses = {}
        for batch in partition_all(batch_size, data):
            # Use batch-local names: the original unpacked into `docs`,
            # overwriting the notebook-level list of Docs built earlier.
            batch_docs, batch_golds = zip(*batch)
            nlp.update(batch_docs, batch_golds, sgd=optimizer, drop=0.35, losses=losses)

In [303]:
### Save the trained pipeline to out_dir under the name new_model
new_model = 'fassster_news'
out_dir = 'models'
nlp.meta['name'] = new_model
nlp.to_disk(out_dir)
# The original message lacked the space after "model" and printed
# "Saved modelfassster_news".
print('Saved model ' + new_model + ' to ' + out_dir)

Saved modelfassster_news to models


In [304]:
### Reload the pipeline saved in out_dir into a second object, nlp2
print('Loading model from {}'.format(out_dir))
nlp2 = spacy.load(out_dir)

Loading model from models


In [305]:
### Run the reloaded model on an unseen sentence and show its entities.
test = nlp2(
    'Dengue cases in Negros Occidental have reached an epidemic threshold. '
    'While 24 deaths have been reported in Negros Occidental – 8 of which were '
    'in Bacolod City – since January of this year, the provincial government '
    'has yet to declare a state of calamity.'
)
displacy.render(test, style='ent', jupyter=True)

In [103]:
# Training examples: (text, annotations) pairs whose 'entities' value is a
# list of (start, end, label) tuples indexing into the text.
train_data = [
    (
        'The number reached 744 in Cagayan Valley.',
        dict(entities=[(4, 10, 'CASE'), (19, 22, 'CARDINAL'), (26, 40, 'GPE')]),
    ),
    (
        'ISABELA, Philippines – The rainy season has just started.',
        dict(entities=[(0, 7, 'GPE'), (9, 20, 'GPE')]),
    ),
    (
        'The municipality of Luna in Isabela province in the northern Philippines already declared state of calamity after their health office recorded 54 cases of dengue in just the month of June.',
        dict(entities=[
            (20, 24, 'GPE'),
            (28, 35, 'GPE'),
            (61, 72, 'GPE'),
            (90, 107, 'STATE'),
            (143, 145, 'CARDINAL'),
            (146, 151, 'CASE'),
            (165, 187, 'DATE'),
        ]),
    ),
]
# Held-out texts: plain strings, no annotations.
test_data = [
    'The number reached 744 in Cagayan Valley, while Calabarzon had the highest with 3,778 and the Autonomous Region of Muslim Mindanao (ARMM) had the lowest with 312.',
]

In [31]:
def train(model=None, new_model='fassster_news', output_dir=None, n_epoch=100):
    """Load or create a pipeline, train its entity recognizer, optionally save it.

    Training examples come from the notebook-level ``train_data`` list of
    ``(text, {'entities': [...]})`` tuples. When ``output_dir`` is given, the
    saved pipeline is reloaded and run over the notebook-level ``test_data``
    texts.

    Args:
        model: Name or path passed to ``spacy.load``; ``None`` creates a
            blank 'en' pipeline instead.
        new_model: Value stored in ``nlp.meta['name']`` before saving.
        output_dir: Directory to save the pipeline to; ``None`` skips both
            saving and the reload check.
        n_epoch: Number of passes over the training examples.

    Returns:
        None. Results are shown through ``displacy.render`` and ``print``.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # Make sure the pipeline has an entity recognizer we can add labels to:
    # create and append one if missing, otherwise fetch the existing pipe.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # Work on a private copy so the per-epoch shuffle below does not reorder
    # the notebook-level train_data list in place.
    examples = list(train_data)

    # Register every label that occurs in the annotations. A missing
    # 'entities' key is treated as "no entities" instead of raising.
    for _, annotations in examples:
        for ent in annotations.get('entities', []):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # NOTE(review): when `model` is an existing model, spaCy's
        # fine-tuning examples avoid begin_training() (they use
        # resume_training() / the pipe's create_optimizer()) -- confirm
        # this call is intended for that case.
        optimizer = nlp.begin_training()
        for epoch in range(n_epoch):
            random.shuffle(examples)
            losses = {}
            for text, annotations in examples:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.3,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)

    # Show the trained model's predictions on the training texts, in the
    # order they appear in train_data.
    for text, _ in train_data:
        doc = nlp(text)
        displacy.render(doc, style='ent', jupyter=True)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True so a nested output path works as well
            output_dir.mkdir(parents=True)
        nlp.meta['name'] = new_model  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for test in test_data:
            doc2 = nlp2(test)
            displacy.render(doc2, style='ent', jupyter=True)

In [34]:
### Train a blank 'en' model for 100 epochs and save it under 'models'
train(model=None, new_model='fassster_news', output_dir='models', n_epoch=100)

Created blank 'en' model


Saved model to models
Loading from models


In [33]:
### Run the 'en' model over the texts in test_data and render the
### entities it finds in each one.
nlp = spacy.load('en')
for test in test_data:
    doc = nlp(test)
    displacy.render(
        doc,
        style='ent',
        jupyter=True,
    )

In [108]:
### Reload the 'en' model and set up the inputs for the revision experiment:
### revision_texts are plain strings, fine_tune_data pairs the same strings
### with hand-written entity annotations, revision_data starts out empty.
nlp = spacy.load('en')
revision_data = []
revision_texts = [
    'I love my baby Niely so much.',
    'He is my bouncing baby boy.',
]
fine_tune_data = [
    ('I love my baby Niely so much.', dict(entities=[(15, 20, 'PERSON')])),
    ('He is my bouncing baby boy.', dict(entities=[(0, 2, 'PERSON')])),
]

In [109]:
### Run each text in revision_texts through nlp.pipe(), copy the
### model's own tags, heads, deps and entities into a GoldParse,
### and append the (doc, GoldParse) pair to revision_data.
for doc in nlp.pipe(revision_texts):
    tags, heads, deps = [], [], []
    for token in doc:
        tags.append(token.tag_)
        heads.append(token.head.i)
        deps.append(token.dep_)
    entities = [(span.start_char, span.end_char, span.label_) for span in doc.ents]
    gold = GoldParse(doc, tags=tags, heads=heads, deps=deps, entities=entities)
    revision_data.append((doc, gold))

In [110]:
### Update the model on revision_data plus fine_tune_data.
### The two lists hold different example types -- (Doc, GoldParse) pairs
### and (text, annotations dict) pairs -- and are mixed in one batch.
n_epoch = 5
batch_size = 32
for i in range(n_epoch):
    examples = revision_data + fine_tune_data
    losses = {}
    random.shuffle(examples)
    for batch in partition_all(batch_size, examples):
        # Use batch-local names: the original unpacked into `docs`,
        # overwriting the notebook-level variable of the same name.
        batch_docs, batch_golds = zip(*batch)
        nlp.update(batch_docs, batch_golds, losses=losses)

In [114]:
### Render the dependency parse of each revision text.
for x in revision_texts:
    # With jupyter=True the call produced no return value worth printing:
    # the original print() wrapper only emitted "None" after each figure.
    displacy.render(nlp(x), style='dep', jupyter=True)

None


None


In [28]:
### Reload the 'en' model and collect every 'content' text from
### train_data.json into a plain list called docs.
nlp = spacy.load('en')
df = pd.read_json('train_data.json')
docs = list(df['content'])

In [37]:
### Two separate empty lists, to be filled by later cells.
raw_data, gold_data = [], []

In [38]:
### Run every text in docs through nlp.pipe() and store each result in
### raw_data together with a GoldParse built from the model's own
### tags, heads, deps and entities for that doc.
for doc in nlp.pipe(docs):
    tags = list(map(lambda w: w.tag_, doc))
    heads = list(map(lambda w: w.head.i, doc))
    deps = list(map(lambda w: w.dep_, doc))
    entities = []
    for e in doc.ents:
        entities.append((e.start_char, e.end_char, e.label_))
    gold = GoldParse(doc, tags=tags, heads=heads, deps=deps, entities=entities)
    raw_data.append((doc, gold))

In [81]:
### Render the entities of the first doc stored in raw_data
### (raw_data[0] is a (doc, GoldParse) pair; [0] picks the doc).
displacy.render(raw_data[0][0], style='ent', jupyter=True)

In [58]:
### Print start offset, end offset and label for every entity of the
### first doc stored in raw_data.
first_doc_ents = raw_data[0][0].ents
for span in first_doc_ents:
    print('{} {} {}'.format(span.start_char, span.end_char, span.label_))

0 7 GPE
9 20 GPE
82 86 PERSON
90 97 GPE
123 134 GPE
205 207 CARDINAL
227 249 DATE
305 310 GPE
366 382 PERSON
384 388 PERSON
415 431 DATE
466 475 DATE
538 552 DATE
556 570 DATE
807 816 ORG
895 911 DATE
913 921 CARDINAL
929 931 CARDINAL
990 999 ORG
1154 1159 DATE
1161 1185 ORG
1201 1206 PERCENT
1255 1261 CARDINAL
1271 1296 DATE
1317 1320 CARDINAL
1324 1338 LOC
1346 1356 GPE
1378 1383 CARDINAL
1388 1428 GPE
1430 1434 DATE
1456 1459 CARDINAL
1461 1471 ORG
1533 1534 CARDINAL
1711 1740 DATE
