In [0]:
from __future__ import unicode_literals, print_function

import json
import logging
import random
import re
import string
from pathlib import Path

import numpy as np
import pandas as pd
import plac
import spacy
from tqdm import tqdm

In [0]:
def mergeIntervals(intervals):
    """Collapse overlapping ``(start, end, label)`` spans into a non-overlapping list.

    Spans are processed in order of ascending ``start``. A span whose start is
    less than or equal to the end of the previously kept span is treated as
    overlapping it (so spans that merely touch are merged as well):

    * same label: the two spans are fused into one covering both;
    * different labels: whichever span reaches further right wins. If the
      earlier span ends strictly later it is kept unchanged; otherwise the
      result starts at the earlier span's start and takes the later span's
      end and label.

    Args:
        intervals: Iterable of indexable triples ``(start, end, label)``.

    Returns:
        list: Spans sorted by start. Untouched spans are returned as the
        original objects; fused spans are new tuples.
    """
    sorted_by_lower_bound = sorted(intervals, key=lambda tup: tup[0])
    merged = []

    for higher in sorted_by_lower_bound:
        if not merged:
            merged.append(higher)
            continue

        lower = merged[-1]
        if higher[0] > lower[1]:
            # Starts strictly after the previous span ends: no overlap.
            merged.append(higher)
        elif lower[2] == higher[2]:
            # Compare labels by value, not identity: equal label strings
            # parsed from JSON are usually distinct objects.
            merged[-1] = (lower[0], max(lower[1], higher[1]), lower[2])
        elif lower[1] <= higher[1]:
            # Conflicting labels and the later span reaches at least as far:
            # extend the span and adopt the later label.
            merged[-1] = (lower[0], higher[1], higher[2])
        # Otherwise the earlier span fully contains the later one and is kept.

    return merged

In [0]:
def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Each span's start is moved right past whitespace characters and its end is
    moved left past whitespace characters. Neither edge is ever moved across
    the other, so a span that consists only of whitespace collapses to an
    empty span (``start == end``) instead of an inverted one.

    Args:
        data (list): The data to be cleaned in spaCy JSON format, i.e. an
            iterable of ``(text, {'entities': [(start, end, label), ...]})``.

    Returns:
        list: The cleaned data as ``[text, {'entities': [[start, end, label],
        ...]}]`` items. Only the ``'entities'`` key of each annotation dict is
        carried over.

    Raises:
        IndexError: If a span's end lies beyond the end of its text.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        valid_entities = []
        for start, end, label in annotations['entities']:
            valid_start = start
            valid_end = end
            # Skip leading whitespace, stopping at the span's own end (and the
            # end of the text) so the start can never overtake the end.
            while (valid_start < valid_end
                   and valid_start < len(text)
                   and invalid_span_tokens.match(text[valid_start])):
                valid_start += 1
            # Skip trailing whitespace, stopping at the (already trimmed) start.
            while (valid_end > valid_start
                   and invalid_span_tokens.match(text[valid_end - 1])):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data

def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a JSON-lines annotation export into spaCy-style training tuples.

    Every non-blank line of the file is parsed as one JSON object with a
    ``'content'`` text and an ``'annotation'`` entry that is either ``None``
    or a list of annotations. For each annotation only the first element of
    ``'points'`` is used, and ``'label'`` may be a single value or a list (one
    span is produced per label). Overlapping spans of a record are merged with
    ``mergeIntervals``.

    Args:
        dataturks_JSON_FilePath: Path (``str`` or path-like) of the export file,
            read as UTF-8.

    Returns:
        list | None: A list of ``(text, {"entities": [(start, end, label), ...]})``
        tuples, or ``None`` if the file could not be read or parsed (the
        failure is logged with its traceback).
    """
    try:
        training_data = []
        with open(dataturks_JSON_FilePath, 'r', encoding="utf8") as f:
            for line in f:
                # Blank lines (e.g. a trailing newline at the end of the
                # export) carry no record and must not abort the conversion.
                if not line.strip():
                    continue

                data = json.loads(line)
                text = data['content']
                entities = []
                if data['annotation'] is not None:
                    for annotation in data['annotation']:
                        point = annotation['points'][0]
                        labels = annotation['label']
                        if not isinstance(labels, list):
                            labels = [labels]

                        for label in labels:
                            # The stored end is shifted by one; presumably the
                            # export uses inclusive ends while spaCy expects
                            # exclusive ones -- TODO confirm against the export format.
                            entities.append((
                                point['start'],
                                point['end'] + 1,
                                label
                            ))

                training_data.append((text, {"entities": mergeIntervals(entities)}))
        return training_data
    except Exception:
        # Lazy %-formatting also works for path-like arguments, where string
        # concatenation would raise inside the handler and hide the real error.
        logging.exception("Unable to process %s", dataturks_JSON_FilePath)
        return None


# Build the training set: convert the annotation export to spaCy-style
# (text, {"entities": [...]}) items, then strip whitespace from span edges.
# NOTE: convert_dataturks_to_spacy returns None when the file cannot be
# processed, in which case trim_entity_spans fails while iterating it.
TRAIN_DATA = trim_entity_spans(
    convert_dataturks_to_spacy("/content/drive/My Drive/traindata.json")
)

In [0]:
# Name/path of an existing spaCy model to continue training, or None to start
# from a blank English pipeline.
model = None
# Seeds Python's `random` module only, which fixes the shuffle order below.
random.seed(0)
if model is not None:
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank("en")
    print("Created blank 'en' model")

# Ensure the pipeline has an NER component and keep a handle on it.
if "ner" not in nlp.pipe_names:
    print("Creating new pipe")
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe("ner")

# Number of passes over TRAIN_DATA.
iters = 60

# Register every label that occurs in the training annotations.
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

# Not used further in this cell; kept in case later cells read it.
move_names = list(ner.move_names)
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):  # only train NER
    # begin_training() initialises the weights from scratch, which is only
    # appropriate for a new model; a loaded model must keep its weights.
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    for itn in range(iters):
        random.shuffle(TRAIN_DATA)
        losses = {}
        # One update per example (batch size 1).
        for text, annotations in TRAIN_DATA:
            nlp.update(
                [text],
                [annotations],
                drop=0.4,  # dropout rate
                sgd=optimizer,
                losses=losses)
        print("Iteration " + str(itn), "Losses", losses)


Created blank 'en' model
Creating new pipe
Iteration 0 Losses {'ner': 72881.84172887275}
Iteration 1 Losses {'ner': 56274.57866312674}
Iteration 2 Losses {'ner': 49364.31046622158}
Iteration 3 Losses {'ner': 45600.37251938517}
Iteration 4 Losses {'ner': 40238.84745400952}
Iteration 5 Losses {'ner': 40765.96137773611}
Iteration 6 Losses {'ner': 36079.835796331994}
Iteration 7 Losses {'ner': 33979.621763201256}
Iteration 8 Losses {'ner': 36937.14377678442}
Iteration 9 Losses {'ner': 36304.54675359326}
Iteration 10 Losses {'ner': 34965.55834843795}
Iteration 11 Losses {'ner': 33993.4809869439}
Iteration 12 Losses {'ner': 31454.66553217383}
Iteration 13 Losses {'ner': 31362.75079856239}
Iteration 14 Losses {'ner': 31617.293582180206}
Iteration 15 Losses {'ner': 30012.023781760192}
Iteration 16 Losses {'ner': 29134.73870886322}
Iteration 17 Losses {'ner': 28835.70033996148}
Iteration 18 Losses {'ner': 29160.606647147368}
Iteration 19 Losses {'ner': 29578.627700421486}
Iteration 20 Losses {'

In [0]:
# Serialize the `nlp` pipeline to this directory so it can be loaded again later.
# NOTE(review): the path looks like a Google Drive mount in Colab -- confirm the
# drive is mounted before running this cell.
nlp.to_disk('/content/drive/My Drive/pyre_model/')