In [1]:
import json
#Converting JSON1 files to Spacy tuples format
def convert_doccano_to_spacy(filepath):
    with open(filepath, 'rb') as fp:
        data = fp.readlines()
    training_data = []
    for record in data:
        entities = []
        read_record = json.loads(record)
        text = read_record['text']
        entities_record = read_record['labels']
        for start, end, label in entities_record:
            entities.append((start, end, label))
            training_data.append((text, {"entities": entities}))
    return training_data

In [2]:
data = convert_doccano_to_spacy('D:/text_annotation_files/file2.json1')

In [3]:
import re


def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
        data (list): The data to be cleaned in spaCy JSON format.

    Returns:
        list: The cleaned data.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data

In [4]:
cleaned_data = trim_entity_spans(data)

In [5]:
import spacy
import random
def train_spacy(data,iterations):
    TRAIN_DATA = data
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    # add labels
    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Statring iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp


prdnlp = train_spacy(cleaned_data, 100)

Statring iteration 0
{'ner': 24375.440353158163}
Statring iteration 1
{'ner': 8542.466504179269}
Statring iteration 2
{'ner': 5246.99096208254}
Statring iteration 3
{'ner': 3900.1009312471783}
Statring iteration 4
{'ner': 3238.0743097955883}
Statring iteration 5
{'ner': 2556.0306660632154}
Statring iteration 6
{'ner': 2498.5380593417763}
Statring iteration 7
{'ner': 2211.6291276322463}
Statring iteration 8
{'ner': 2196.5983538754776}
Statring iteration 9
{'ner': 1825.7198870501657}
Statring iteration 10
{'ner': 1703.2877072041144}
Statring iteration 11
{'ner': 1676.2768710651883}
Statring iteration 12
{'ner': 1528.2309858093463}
Statring iteration 13
{'ner': 1453.5087314661846}
Statring iteration 14
{'ner': 1347.5584247506588}
Statring iteration 15
{'ner': 1416.6139585074538}
Statring iteration 16
{'ner': 1221.009520820261}
Statring iteration 17
{'ner': 1229.48957112208}
Statring iteration 18
{'ner': 1113.3993561759946}
Statring iteration 19
{'ner': 1152.7580142172098}
Statring iterati

In [6]:
# Save our trained Model
modelfile = input("Enter your Model Name: ")
prdnlp.to_disk(modelfile)

#Test your text
test_text = input("Enter your testing text: ")
doc = prdnlp(test_text)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Enter your Model Name: ShoryaV1
Enter your testing text: as an ndi account rep(user) i want to see all my saved orders as well as saved orders my assigned customers has created, so that i can view, edit, delete or submit to an order to help customer with quick order placement 
ndi account rep(user 6 26 Keyphrase
order placement 204 219 Keyphrase
