In [0]:
# Installing dependencies
# Uncomment and run this cell once in the beginning
# ! pip install spacy google-cloud-vision pandas tqdm

In [0]:
# Importing libraries
import io
import json
import logging
import os
import random
import re

import pandas as pd
import spacy
from google.cloud import vision
from tqdm import tqdm_notebook as tqdm

In [0]:
# JSON formatting functions
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks NER export (one JSON object per line) into spaCy tuples.

    Args:
        dataturks_JSON_FilePath: Path of the export file (str or path-like).
            Every non-blank line must be a JSON object holding the text under
            'content' and its annotations under 'annotation' (a list, or
            null/absent when the text has no annotations).

    Returns:
        list | None: One ``(text, {"entities": [(start, end, label), ...]})``
        tuple per record, where ``end`` is exclusive. ``None`` is returned when
        the file cannot be read or parsed; the error is logged with traceback.
    """
    try:
        training_data = []
        # JSON text is UTF-8; do not depend on the platform default encoding,
        # otherwise character offsets can shift on non-UTF-8 locales.
        with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
            for line in f:
                # Tolerate blank separator/trailing lines instead of letting
                # json.loads fail and discard the whole file.
                if not line.strip():
                    continue

                data = json.loads(line)
                text = data['content']
                entities = []
                for annotation in data.get('annotation') or []:
                    # only a single point in text annotation.
                    point = annotation['points'][0]
                    labels = annotation['label']
                    # handle both list of labels or a single label.
                    if not isinstance(labels, list):
                        labels = [labels]

                    # Shrink the span by the amount of leading/trailing
                    # whitespace in the annotated text. This depends only on
                    # the point, so compute it once rather than per label.
                    point_text = point['text']
                    leading = len(point_text) - len(point_text.lstrip())
                    trailing = len(point_text) - len(point_text.rstrip())
                    point_start = point['start'] + leading
                    point_end = point['end'] - trailing

                    # The source 'end' is inclusive; the emitted end is
                    # exclusive, hence the + 1.
                    entities.extend(
                        (point_start, point_end + 1, label) for label in labels)
                training_data.append((text, {"entities": entities}))
        return training_data
    except Exception as e:
        # %-style arguments also work when the path is not a plain str.
        logging.exception("Unable to process %s\nerror = %s",
                          dataturks_JSON_FilePath, e)
        return None

def trim_entity_spans(data: list) -> list:
    """Shrink entity spans so they do not begin or end on whitespace.

    Args:
        data (list): Items of the form ``(text, {'entities': [(start, end,
            label), ...]})``. ``end`` is treated as exclusive: the character
            inspected at the right edge is ``text[end - 1]``.

    Returns:
        list: A new list of ``[text, {'entities': [[start, end, label], ...]}]``
        items with adjusted offsets. The input is not modified.
    """
    whitespace = re.compile(r'\s')

    def shrink(text, left, right):
        # Advance the left edge past whitespace, never beyond the text length.
        limit = len(text)
        while left < limit and whitespace.match(text[left]):
            left += 1
        # Pull the right edge back over whitespace; it is never moved below 1.
        while right > 1 and whitespace.match(text[right - 1]):
            right -= 1
        return left, right

    return [
        [
            text,
            {
                'entities': [
                    [*shrink(text, start, end), label]
                    for start, end, label in annotations['entities']
                ]
            },
        ]
        for text, annotations in data
    ]

In [0]:
################### Train Spacy NER.###########
def train_spacy(data_path="SpacyData.json", n_iter=20, dropout=0.3, output_dir="NER"):
    """Train a blank English spaCy pipeline containing only an NER component.

    The defaults reproduce the previously hard-coded settings, so calling
    ``train_spacy()`` with no arguments behaves as before.

    NOTE(review): this uses ``create_pipe`` / ``add_pipe(component)`` /
    ``begin_training`` / ``update(texts, annotations)``, which looks like the
    spaCy 2.x training API - confirm the installed spaCy version.

    Args:
        data_path: Dataturks export file, passed to convert_dataturks_to_spacy.
        n_iter: Number of passes over the training data.
        dropout: Dropout rate passed to ``nlp.update``.
        output_dir: Directory the pipeline is written to after every pass.

    Returns:
        The trained ``nlp`` pipeline object.

    Raises:
        ValueError: If the training file could not be converted
            (convert_dataturks_to_spacy returned None).
    """
    raw_data = convert_dataturks_to_spacy(data_path)
    if raw_data is None:
        # The converter logs the cause and returns None; stop here with a clear
        # message instead of failing later on iteration over None.
        raise ValueError("Could not load training data from " + str(data_path))
    TRAIN_DATA = trim_entity_spans(raw_data)  # train data

    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        # Keep `ner` bound even if the pipeline already contains the component.
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=dropout,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
            # Saved after every pass, so the directory always holds the most
            # recently completed iteration (each save overwrites the last).
            nlp.to_disk(output_dir)
    return nlp

In [0]:
# Train the NER model (train_spacy runs 20 passes over the training data and
# prints the accumulated loss after each one); keep the trained pipeline in `nlp`.
nlp = train_spacy()

Starting iteration 0
{'ner': 1389.4218910541458}
Starting iteration 1
{'ner': 302.9351514141446}
Starting iteration 2
{'ner': 367.38948576275675}
Starting iteration 3
{'ner': 147.95892109721365}
Starting iteration 4
{'ner': 164.62000559092425}
Starting iteration 5
{'ner': 167.2583772462929}
Starting iteration 6
{'ner': 61.91851513966879}
Starting iteration 7
{'ner': 38.986973023105826}
Starting iteration 8
{'ner': 47.51857286418727}
Starting iteration 9
{'ner': 31.212716692883966}
Starting iteration 10
{'ner': 33.38858660239155}
Starting iteration 11
{'ner': 20.69545877822283}
Starting iteration 12
{'ner': 30.322275450144378}
Starting iteration 13
{'ner': 21.912833313412005}
Starting iteration 14
{'ner': 10.607650553790785}
Starting iteration 15
{'ner': 25.74186658124544}
Starting iteration 16
{'ner': 16.49051592265769}
Starting iteration 17
{'ner': 20.084303738077946}
Starting iteration 18
{'ner': 11.082623067535678}
Starting iteration 19
{'ner': 9.83011691064422}


In [0]:
# Zipping model folder
! zip -r NER.zip NER/

  adding: NER/ (stored 0%)
  adding: NER/vocab/ (stored 0%)
  adding: NER/vocab/key2row (stored 0%)
  adding: NER/vocab/lexemes.bin (deflated 69%)
  adding: NER/vocab/vectors (deflated 45%)
  adding: NER/vocab/strings.json (deflated 68%)
  adding: NER/meta.json (deflated 41%)
  adding: NER/tokenizer (deflated 83%)
  adding: NER/ner/ (stored 0%)
  adding: NER/ner/cfg (deflated 47%)
  adding: NER/ner/model (deflated 7%)
  adding: NER/ner/moves (deflated 74%)
