In [1]:
import os
import re
import logging
import json
import random
import spacy
from sklearn.model_selection import train_test_split
from spacy.util import minibatch, compounding
# from seqeval import metrics


def trim_entity_spans(data: list) -> list:
    """Strip leading and trailing whitespace from every entity span.

    Each entity's character offsets are tightened so that neither
    ``text[start]`` nor ``text[end - 1]`` is a whitespace character.
    Labels, and the number and order of the entities, are left untouched.

    Args:
        data (list): Examples in spaCy's ``(text, {'entities': [...]})``
            training format, where each entity is ``(start, end, label)``.

    Returns:
        list: New ``[text, {'entities': [[start, end, label], ...]}]``
            entries with trimmed offsets. A span that consists solely of
            whitespace collapses to an empty span (``start == end``)
            rather than an inverted one.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']

        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            # Walk the start forward past any leading whitespace.
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            # Walk the end backward past trailing whitespace, but never
            # past the (already trimmed) start: a fixed lower bound let
            # whitespace-only spans come out with end < start.
            while valid_end > valid_start and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data


def convert_doccano_to_spacy(doccano_JSON_FilePath, excluded_labels=('LOC',)):
    """Convert a doccano JSON-lines export into spaCy training tuples.

    Every non-blank line of the file is parsed as one JSON object with a
    ``text`` string and a ``labels`` list of ``[start, end, label]``
    triples.

    Args:
        doccano_JSON_FilePath: Path of the JSON-lines file to read.
        excluded_labels: Labels to drop from the output. Defaults to
            ``('LOC',)``, which matches the previous hard-coded filter.

    Returns:
        list | None: ``(text, {"entities": [(start, end, label), ...]})``
            tuples, one per record, or ``None`` when the file cannot be
            read or parsed (the failure is logged with its traceback).
    """
    try:
        training_data = []
        # Decode explicitly as UTF-8 instead of the platform default so
        # that non-ASCII text is read the same way on every machine; the
        # entity offsets are character positions in the decoded text.
        # NOTE(review): assumes the export is UTF-8 encoded - confirm.
        with open(doccano_JSON_FilePath, 'r', encoding='utf-8') as f:
            for line in f:
                # A blank or trailing empty line would make json.loads
                # raise and discard the whole file; skip it instead.
                if not line.strip():
                    continue
                data = json.loads(line)
                entities = [
                    (start, end, label)
                    for start, end, label in data['labels']
                    if label not in excluded_labels
                ]
                training_data.append((data['text'], {"entities": entities}))

        return training_data

    except Exception as e:
        # Best-effort contract: log the problem and signal it with None.
        logging.exception("Unable to process %s\nerror = %s",
                          doccano_JSON_FilePath, e)
    return None


In [None]:
# !tar -cvzf healthy.tar.gz 

In [2]:
def train_test_data(filepath='data_ciat_spacy/ciat_final.json',
                    test_size=0.2, random_state=1):
    """Load the annotated corpus, clean its spans and split it.

    Args:
        filepath: doccano JSON-lines export to load. Defaults to the
            path that was previously hard-coded.
        test_size: Fraction of examples held out for testing, passed to
            ``train_test_split``.
        random_state: Seed passed to ``train_test_split`` so the split
            is repeatable.

    Returns:
        tuple: ``(training_data, testing_data)`` lists of cleaned
            spaCy-format examples.

    Raises:
        RuntimeError: If the corpus could not be read or parsed.
    """
    converted = convert_doccano_to_spacy(filepath)
    # The converter reports failure by returning None; stop here with a
    # clear message instead of failing later while iterating over None.
    if converted is None:
        raise RuntimeError(
            "Could not load training data from {!r}; see the logged "
            "error for details.".format(filepath))

    cleaned = trim_entity_spans(converted)

    training_data, testing_data = train_test_split(
        cleaned, test_size=test_size, random_state=random_state)
    return training_data, testing_data



In [5]:
# Build the train/test split once; both lists are reused by later cells.
train_data, test_data = train_test_data()

In [9]:
def train_spacy(model, training_data, n_epochs=100, drop=0.5):
    """Train (or continue training) a spaCy NER component.

    Args:
        model: Name or path of an existing spaCy model to load, or
            ``None`` to start from a blank English pipeline.
        training_data: Examples as ``(text, {"entities": [...]})`` pairs,
            each entity being ``(start, end, label)``. The caller's
            sequence is not modified.
        n_epochs (int): Number of passes over the training data.
        drop (float): Dropout rate passed to ``nlp.update``. Defaults to
            the previously hard-coded 0.5.

    Returns:
        The ``nlp`` pipeline object after training.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # Add an NER component if the pipeline has none; otherwise fetch the
    # existing one so that new labels can be registered on it.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Work on a private copy: shuffling below must not reorder the list
    # the caller passed in (it is reused elsewhere in the notebook).
    examples = list(training_data)

    # Register every label that occurs in the data; an example without
    # an "entities" entry simply contributes none.
    for _, annotations in examples:
        for ent in annotations.get("entities") or []:
            ner.add_label(ent[2])

    # Disable every other pipe so that only NER is updated.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        # Reset and initialise the weights randomly, but only when
        # training a new model.
        if model is None:
            nlp.begin_training()
        for itn in range(n_epochs):
            random.shuffle(examples)
            losses = {}
            # Batch sizes are drawn from a fresh compounding schedule
            # each epoch (arguments 4.0, 32.0, 1.001).
            batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=drop,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("epoch: {} Losses: {}".format(itn, str(losses)))
    return nlp

In [10]:
# Passing None as the model starts from a blank 'en' pipeline; n_epochs keeps its default.
trained_model = train_spacy(None, train_data)

Created blank 'en' model
epoch: 0 Losses: {'ner': 15462.890112189918}
epoch: 1 Losses: {'ner': 2546.6429027457125}
epoch: 2 Losses: {'ner': 2086.971809047973}
epoch: 3 Losses: {'ner': 1958.0862018113694}
epoch: 4 Losses: {'ner': 1713.496927947439}
epoch: 5 Losses: {'ner': 1560.2137369937657}
epoch: 6 Losses: {'ner': 1372.1907283872551}
epoch: 7 Losses: {'ner': 1248.0370129434523}
epoch: 8 Losses: {'ner': 1136.9904694945835}
epoch: 9 Losses: {'ner': 1126.5190596497246}
epoch: 10 Losses: {'ner': 1183.9630931121596}
epoch: 11 Losses: {'ner': 1130.966873584017}
epoch: 12 Losses: {'ner': 1120.210019589765}
epoch: 13 Losses: {'ner': 1009.7791569257392}
epoch: 14 Losses: {'ner': 928.9640845634722}
epoch: 15 Losses: {'ner': 942.1700517380768}
epoch: 16 Losses: {'ner': 916.6272038035324}
epoch: 17 Losses: {'ner': 869.6987377356625}
epoch: 18 Losses: {'ner': 882.1699191863568}
epoch: 19 Losses: {'ner': 765.5053489962635}
epoch: 20 Losses: {'ner': 822.5741010019942}
epoch: 21 Losses: {'ner': 748.

### Training approach 2

In [None]:
# new_labels = set()
# for entry in train_data:
#     for label in entry[1]["entities"]:
#         new_labels.add(label[2])
# new_labels

In [None]:
# import spacy
# import random

# nlp = spacy.blank('en')
# ner = nlp.create_pipe('ner')
# nlp.add_pipe(ner)


# for label in new_labels:
#     ner.add_label(label)   # add new entity label to entity recognizer

In [None]:
# %%time
# from spacy.util import minibatch

# optimizer = nlp.begin_training()
# n_iter = 100
# for itn in range(n_iter):
#     # i = 0
#     random.shuffle(train_data)
#     losses = {}
#     batches = minibatch(train_data, 100)
#     for batch in batches:
#         texts, annotations = zip(*batch)
#         nlp.update(texts, annotations, sgd=optimizer, drop=0.5, losses=losses)
#         # if i%100 == 0: 
#         #     print(losses)
#         # i += 1
#     print("Loss after epoch", itn, "=", losses["ner"])

### End of training approach 2

In [11]:
# Score the trained pipeline against the held-out test split.
from spacy.gold import GoldParse
from spacy.scorer import Scorer

nlp = trained_model
scorer = Scorer()
for input_, annot in test_data:
    # Gold standard: a Doc built with make_doc, paired with the annotated entity offsets.
    doc_gold_text = nlp.make_doc(input_)
    gold = GoldParse(doc_gold_text, entities=annot["entities"])
    # Prediction: the same text run through the full pipeline.
    pred_value = nlp(input_)
    scorer.score(pred_value, gold)
# Scores accumulated over all test examples.
print(scorer.scores)

{'uas': 0.0, 'las': 0.0, 'las_per_type': {'': {'p': 0.0, 'r': 0.0, 'f': 0.0}}, 'ents_p': 78.18181818181819, 'ents_r': 63.23529411764706, 'ents_f': 69.91869918699187, 'ents_per_type': {'Disease': {'p': 60.0, 'r': 54.54545454545454, 'f': 57.14285714285713}, 'GPE': {'p': 88.57142857142857, 'r': 67.3913043478261, 'f': 76.5432098765432}}, 'tags_acc': 0.0, 'token_acc': 100.0, 'textcat_score': 0.0, 'textcats_per_cat': {}}


In [None]:
# !rm -R out/

In [12]:
# Save the trained pipeline to the output directory, tagging its
# metadata name with the current timestamp.
from datetime import datetime
from pathlib import Path
dt = datetime.now().strftime("%d-%m-%Y.%H-%M")  # e.g. 31-12-2020.23-59


output_dir = Path("out_ciat")
# Create the directory on first use; an existing one is reused as-is.
if not output_dir.exists():
    output_dir.mkdir()
nlp.meta['name'] = "NER " + dt # rename model
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

Saved model to out_ciat


In [13]:
# Number of examples in the training split.
len(train_data)

62

In [14]:
# Reload the pipeline saved on disk and compare its predictions with the
# gold annotations for one test example.
nlp2 = spacy.load("out_ciat")
doc2 = nlp2(test_data[2][0])
# Predicted entities: label followed by the matched text.
for ent in doc2.ents:
    print(ent.label_, ent.text)
# Gold entities for the same example: [start, end, label] offsets.
for e in test_data[2][1]["entities"]:
    print(e)

Disease Potato mop-top
Disease Potato mop-top
GPE Spongospora
GPE Illumina
GPE Chiloé Province
GPE the United States
GPE near Chiloé Province
[16, 30, 'Disease']
[40, 45, 'GPE']
[48, 62, 'Disease']
[625, 630, 'GPE']
[1579, 1594, 'GPE']
[2741, 2746, 'GPE']
[2869, 2884, 'GPE']
[3050, 3055, 'GPE']
[599, 614, 'GPE']
[579, 585, 'GPE']
[1142, 1148, 'GPE']


In [16]:
# !zip -r out_ciat.zip out_ciat

  adding: out_ciat/ (stored 0%)
  adding: out_ciat/vocab/ (stored 0%)
  adding: out_ciat/vocab/strings.json (deflated 69%)
  adding: out_ciat/vocab/vectors (deflated 45%)
  adding: out_ciat/vocab/key2row (stored 0%)
  adding: out_ciat/vocab/lexemes.bin (deflated 73%)
  adding: out_ciat/ner/ (stored 0%)
  adding: out_ciat/ner/cfg (deflated 47%)
  adding: out_ciat/ner/model (deflated 7%)
  adding: out_ciat/ner/moves (deflated 56%)
  adding: out_ciat/meta.json (deflated 37%)
  adding: out_ciat/tokenizer (deflated 82%)
