In [81]:
# Compute preexcisting SpaCy NER model

import spacy
import re
from spacy.language import Language
spacy.prefer_gpu()

nlp = spacy.load('en_core_web_sm')

# Correct for more precise bounaries
boundary = re.compile('^[0-9]$')

@Language.component("component")
def custom_seg(doc):
    prev = doc[0].text
    length = len(doc)
    for index, token in enumerate(doc):
        if (token.text == '.' and boundary.match(prev) and index!=(length - 1)):
            doc[index+1].sent_start = False
        prev = token.text
    return doc
    
nlp.add_pipe("component", before='parser')

# Getting the pipeline NER component
ner=nlp.get_pipe("ner")

print(nlp.pipe_names)

['tok2vec', 'tagger', 'component', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [82]:
# training custom NER annotated data

TRAIN_DATA = [
              ("Walmart is a leading e-commerce company", {"entities": [(0, 7, "ORG")]}),
              ("I reached Chennai yesterday.", {"entities": [(19, 28, "GPE")]}),
              ("I recently ordered a book from Amazon", {"entities": [(24,32, "ORG")]}),
              ("I was driving a BMW", {"entities": [(16,19, "PRODUCT")]}),
              ("I ordered this from ShopClues", {"entities": [(20,29, "ORG")]}),
              ("Fridge can be ordered in Amazon ", {"entities": [(0,6, "PRODUCT")]}),
              ("I bought a new Washer", {"entities": [(16,22, "PRODUCT")]}),
              ("I bought a old table", {"entities": [(16,21, "PRODUCT")]}),
              ("I bought a fancy dress", {"entities": [(18,23, "PRODUCT")]}),
              ("I rented a camera", {"entities": [(12,18, "PRODUCT")]}),
              ("I rented a tent for our trip", {"entities": [(12,16, "PRODUCT")]}),
              ("I rented a screwdriver from our neighbour", {"entities": [(12,22, "PRODUCT")]}),
              ("I repaired my computer", {"entities": [(15,23, "PRODUCT")]}),
              ("I got my clock fixed", {"entities": [(16,21, "PRODUCT")]}),
              ("I got my truck fixed", {"entities": [(16,21, "PRODUCT")]}),
              ("Flipkart started it's journey from zero", {"entities": [(0,8, "ORG")]}),
              ("I recently ordered from Max", {"entities": [(24,27, "ORG")]}),
              ("Flipkart is recognized as leader in market",{"entities": [(0,8, "ORG")]}),
              ("I recently ordered from Swiggy", {"entities": [(24,29, "ORG")]})
              ]

# Adding labels to the `ner`

for _, annotations in TRAIN_DATA:
  for ent in annotations.get("entities"):
    ner.add_label(ent[2])

In [90]:
# Training a extended NER model
# Disable pipeline components you don't need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Import requirements
import random
from spacy.util import minibatch, compounding

# TRAINING THE MODEL
nlp = spacy.blank('en')
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe('ner', last=True)
else:
    ner = nlp.get_pipe('ner')

for _, annotations in TRAIN_DATA:
    for label in annotations['entities']:
        ner.add_label(label[2])

# TRAINING THE MODEL
from spacy.training import Example        
        
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for epoch in range(10):
        random.shuffle(TRAIN_DATA)
        losses = {}
        print(f'Epoch {epoch+1} of {10}:')
        for text, annotations in TRAIN_DATA:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], drop=0.2, sgd=optimizer, losses=losses) #SGD
        print(losses) #Print losses after each epoch

Epoch 1 of 10:
{'ner': 42.39633257508103}
Epoch 2 of 10:
{'ner': 10.42556143421367}
Epoch 3 of 10:
{'ner': 24.5927018098386}
Epoch 4 of 10:
{'ner': 5.70484348848452}
Epoch 5 of 10:
{'ner': 3.607417439913781}
Epoch 6 of 10:
{'ner': 1.483973307798065}
Epoch 7 of 10:
{'ner': 1.9861611010679505}
Epoch 8 of 10:
{'ner': 0.2707683855519303}
Epoch 9 of 10:
{'ner': 0.002296802585233395}
Epoch 10 of 10:
{'ner': 4.471921552840087e-06}


In [91]:
# Testing the model
doc = nlp("I was driving a Alto")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('Alto', 'PRODUCT')]


In [92]:
# Save the  model to directory
from pathlib import Path
output_dir = Path('./model/')
nlp.to_disk(output_dir)
print("Saved model to directory", output_dir)

Saved model to directory model


In [93]:
# Load the saved model and predict
print("Loading from model directory", output_dir)
nlp_updated = spacy.load(output_dir)
doc = nlp_updated("Fridge can be ordered in FlipKart" )
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Loading from model directory model
Entities [('Fridge', 'PRODUCT'), ('FlipKart', 'ORG')]


In [94]:
# Train NER from a blank spacy model
import spacy
import re
from spacy.language import Language
spacy.prefer_gpu()

nlp=spacy.blank("en")


# Correct for more precise bounaries
boundary = re.compile('^[0-9]$')

@Language.component("component")
def custom_seg(doc):
    prev = doc[0].text
    length = len(doc)
    for index, token in enumerate(doc):
        if (token.text == '.' and boundary.match(prev) and index!=(length - 1)):
            doc[index+1].sent_start = False
        prev = token.text
    return doc
    
nlp.add_pipe("component")

ner = nlp.add_pipe('ner')
nlp.begin_training()

print(nlp.pipe_names)


['component', 'ner']
