https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/

In [1]:
# Compute pre-excisting SpaCy NER model

import spacy
import re 
from spacy.language import Language
spacy.prefer_gpu()

nlp = spacy.load('en_core_web_sm')

# Correct for more precise bounaries
boundary = re.compile('^[0-9]$')

@Language.component("component")
def custom_seg(doc):
    prev = doc[0].text
    length = len(doc)
    for index, token in enumerate(doc):
        if (token.text == '.' and boundary.match(prev) and index!=(length - 1)):
            doc[index+1].sent_start = False
        prev = token.text
    return doc
    
nlp.add_pipe("component", before='parser')

# Getting the pipeline NER component
ner=nlp.get_pipe("ner")

print(nlp.pipe_names)

['tok2vec', 'tagger', 'component', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [2]:
# training custom NER annotated data

TRAIN_DATA = [
              ("Walmart is a leading e-commerce company", {"entities": [(0, 7, "ORG")]}),
              ("I reached Chennai yesterday.", {"entities": [(19, 28, "GPE")]}),
              ("I recently ordered a book from Amazon", {"entities": [(24,32, "ORG")]}),
              ("I was driving a BMW", {"entities": [(16,19, "PRODUCT")]}),
              ("I ordered this from ShopClues", {"entities": [(20,29, "ORG")]}),
              ("Fridge can be ordered in Amazon ", {"entities": [(0,6, "PRODUCT")]}),
              ("I bought a new Washer", {"entities": [(16,22, "PRODUCT")]}),
              ("I bought a old table", {"entities": [(16,21, "PRODUCT")]}),
              ("I bought a fancy dress", {"entities": [(18,23, "PRODUCT")]}),
              ("I rented a camera", {"entities": [(12,18, "PRODUCT")]}),
              ("I rented a tent for our trip", {"entities": [(12,16, "PRODUCT")]}),
              ("I rented a screwdriver from our neighbour", {"entities": [(12,22, "PRODUCT")]}),
              ("I repaired my computer", {"entities": [(15,23, "PRODUCT")]}),
              ("I got my clock fixed", {"entities": [(16,21, "PRODUCT")]}),
              ("I got my truck fixed", {"entities": [(16,21, "PRODUCT")]}),
              ("Flipkart started it's journey from zero", {"entities": [(0,8, "ORG")]}),
              ("I recently ordered from Max", {"entities": [(24,27, "ORG")]}),
              ("Flipkart is recognized as leader in market",{"entities": [(0,8, "ORG")]}),
              ("I recently ordered from Swiggy", {"entities": [(24,29, "ORG")]})
              ]

# Adding labels to the `ner`

for _, annotations in TRAIN_DATA:
  for ent in annotations.get("entities"):
    ner.add_label(ent[2])

In [9]:
# Training a extended NER model
# Disable pipeline components you don't need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Import requirements
import random
from spacy.util import minibatch, compounding

# TRAINING THE MODEL
nlp = spacy.blank('en')
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe('ner', last=True)
else:
    ner = nlp.get_pipe('ner')

for _, annotations in TRAIN_DATA:
    for label in annotations['entities']:
        ner.add_label(label[2])

# TRAINING THE MODEL
from spacy.training import Example        
        
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for epoch in range(10):
        random.shuffle(TRAIN_DATA)
        losses = {}
        print(f'Epoch {epoch+1} of {10}:')
        for text, annotations in TRAIN_DATA:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], drop=0.2, sgd=optimizer, losses=losses) #SGD
        print(losses) #Print losses after each epoch

Epoch 1 of 10:
{'ner': 48.336134323239094}
Epoch 2 of 10:
{'ner': 12.61607434726426}
Epoch 3 of 10:
{'ner': 16.003298977755666}
Epoch 4 of 10:
{'ner': 4.932249405850584}
Epoch 5 of 10:
{'ner': 3.137065685928928}
Epoch 6 of 10:
{'ner': 1.8671225861252847}
Epoch 7 of 10:
{'ner': 1.1325932176187525}
Epoch 8 of 10:
{'ner': 0.08740469777323746}
Epoch 9 of 10:
{'ner': 0.0004492976952553009}
Epoch 10 of 10:
{'ner': 0.027222024830030427}


In [8]:
# Testing the model
doc = nlp("I was driving a Alto")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities []


In [5]:
# Save the  model to directory
from pathlib import Path
output_dir = Path('./model/')
nlp.to_disk(output_dir)
print("Saved model to directory", output_dir)

Saved model to directory model


In [6]:
# Load the saved model and predict
print("Loading from model directory", output_dir)
nlp_updated = spacy.load(output_dir)
doc = nlp_updated("Fridge can be ordered in FlipKart" )
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Loading from model directory model
Entities [('Fridge', 'PRODUCT'), ('FlipKart', 'ORG')]


In [7]:
# EXAMPLE to train NER from a blank spacy model
import spacy
import re
from spacy.language import Language
spacy.prefer_gpu()

nlp=spacy.blank("en")


# Correct for more precise bounaries
boundary = re.compile('^[0-9]$')

@Language.component("component")
def custom_seg(doc):
    prev = doc[0].text
    length = len(doc)
    for index, token in enumerate(doc):
        if (token.text == '.' and boundary.match(prev) and index!=(length - 1)):
            doc[index+1].sent_start = False
        prev = token.text
    return doc
    
nlp.add_pipe("component")

ner = nlp.add_pipe('ner')
nlp.begin_training()

print(nlp.pipe_names)


['component', 'ner']


In [9]:
# EXAMPLE Training completely new entity type in spaCy

import spacy
import re
from spacy.language import Language
spacy.prefer_gpu()

nlp = spacy.load('en_core_web_sm')

# Correct for more precise bounaries
boundary = re.compile('^[0-9]$')

@Language.component("component")
def custom_seg(doc):
    prev = doc[0].text
    length = len(doc)
    for index, token in enumerate(doc):
        if (token.text == '.' and boundary.match(prev) and index!=(length - 1)):
            doc[index+1].sent_start = False
        prev = token.text
    return doc
    
nlp.add_pipe("component", before='parser')

# Getting the pipeline NER component
ner=nlp.get_pipe("ner")

print(nlp.pipe_names)

['tok2vec', 'tagger', 'component', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [10]:
# New label to add
LABEL = "FOOD"

# Training examples in the required format
TRAIN_DATA =[ ("Pizza is a common fast food.", {"entities": [(0, 5, "FOOD")]}),
              ("Pasta is an italian recipe", {"entities": [(0, 5, "FOOD")]}),
              ("China's noodles are very famous", {"entities": [(8,14, "FOOD")]}),
              ("Shrimps are famous in China too", {"entities": [(0,7, "FOOD")]}),
              ("Lasagna is another classic of Italy", {"entities": [(0,7, "FOOD")]}),
              ("Sushi is extemely famous and expensive Japanese dish", {"entities": [(0,5, "FOOD")]}),
              ("Unagi is a famous seafood of Japan", {"entities": [(0,5, "FOOD")]}),
              ("Tempura , Soba are other famous dishes of Japan", {"entities": [(0,7, "FOOD")]}),
              ("Udon is a healthy type of noodles", {"entities": [(0,4, "ORG")]}),
              ("Chocolate soufflé is extremely famous french cuisine", {"entities": [(0,17, "FOOD")]}),
              ("Flamiche is french pastry", {"entities": [(0,8, "FOOD")]}),
              ("Burgers are the most commonly consumed fastfood", {"entities": [(0,7, "FOOD")]}),
              ("Burgers are the most commonly consumed fastfood", {"entities": [(0,7, "FOOD")]}),
              ("Frenchfries are considered too oily", {"entities": [(0,11, "FOOD")]})
           ]

In [11]:
# Add the new label to ner
ner.add_label(LABEL)

# Resume training
optimizer = nlp.resume_training()
move_names = list(ner.move_names)

# List of pipes you want to train
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]

# List of pipes which should remain unaffected in training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]


In [13]:
# Training a extended NER model
# Disable pipeline components you don't need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Import requirements
import random
from spacy.util import minibatch, compounding

# TRAINING THE MODEL
nlp = spacy.blank('en')
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe('ner', last=True)
else:
    ner = nlp.get_pipe('ner')

for _, annotations in TRAIN_DATA:
    for label in annotations['entities']:
        ner.add_label(label[2])

# TRAINING THE MODEL
from spacy.training import Example        
        
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for epoch in range(30):
        random.shuffle(TRAIN_DATA)
        losses = {}
        print(f'Epoch {epoch+1} of {30}:')
        for text, annotations in TRAIN_DATA:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], drop=0.35, sgd=optimizer, losses=losses) #SGD
        print(losses) #Print losses after each epoch

Epoch 1 of 30:
{'ner': 53.49527291953564}
Epoch 2 of 30:
{'ner': 19.49522678235462}
Epoch 3 of 30:
{'ner': 11.43590301005572}
Epoch 4 of 30:
{'ner': 7.849987847871769}
Epoch 5 of 30:
{'ner': 3.7057412351612635}
Epoch 6 of 30:
{'ner': 3.625667381263947}
Epoch 7 of 30:
{'ner': 3.6103162439353977}
Epoch 8 of 30:
{'ner': 3.789772008193676}
Epoch 9 of 30:
{'ner': 2.12324987203493}
Epoch 10 of 30:
{'ner': 1.8972006927803966}
Epoch 11 of 30:
{'ner': 10.62688447760528}
Epoch 12 of 30:
{'ner': 2.904349904697667}
Epoch 13 of 30:
{'ner': 2.3366008946840586}
Epoch 14 of 30:
{'ner': 11.29514894845379}
Epoch 15 of 30:
{'ner': 1.396029188028388}
Epoch 16 of 30:
{'ner': 0.9264486979814582}
Epoch 17 of 30:
{'ner': 0.9526966321408676}
Epoch 18 of 30:
{'ner': 0.04170757135520367}
Epoch 19 of 30:
{'ner': 0.019767511927469346}
Epoch 20 of 30:
{'ner': 0.00032466260415628644}
Epoch 21 of 30:
{'ner': 0.3879618703161943}
Epoch 22 of 30:
{'ner': 0.00021439766538737927}
Epoch 23 of 30:
{'ner': 6.986041051827311e

In [14]:
# Testing the NER

test_text = "I ate Sushi yesterday. Maggi is a common fast food "
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
  print(ent)
#> Entities in 'I ate 

Entities in 'I ate Sushi yesterday. Maggi is a common fast food '
Sushi
Maggi


In [15]:
# test the trained model
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
    print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Entities [('Pizza', 'FOOD')]
Tokens [('Pizza', 'FOOD', 3), ('is', '', 2), ('a', '', 2), ('common', '', 2), ('fast', '', 2), ('food', '', 2), ('.', '', 2)]
Entities [('Unagi', 'FOOD')]
Tokens [('Unagi', 'FOOD', 3), ('is', '', 2), ('a', '', 2), ('famous', '', 2), ('seafood', '', 2), ('of', '', 2), ('Japan', '', 2)]
Entities [('Tempura', 'FOOD')]
Tokens [('Tempura', 'FOOD', 3), (',', '', 2), ('Soba', '', 2), ('are', '', 2), ('other', '', 2), ('famous', '', 2), ('dishes', '', 2), ('of', '', 2), ('Japan', '', 2)]
Entities [('Lasagna', 'FOOD')]
Tokens [('Lasagna', 'FOOD', 3), ('is', '', 2), ('another', '', 2), ('classic', '', 2), ('of', '', 2), ('Italy', '', 2)]
Entities [('Frenchfries', 'FOOD')]
Tokens [('Frenchfries', 'FOOD', 3), ('are', '', 2), ('considered', '', 2), ('too', '', 2), ('oily', '', 2)]
Entities []
Tokens [('China', '', 2), ("'s", '', 2), ('noodles', '', 2), ('are', '', 2), ('very', '', 2), ('famous', '', 2)]
Entities [('Chocolate soufflé', 'FOOD')]
Tokens [('Chocolate', 'FOO

In [16]:
# Output directory
from pathlib import Path
output_dir=Path('./model/')

In [17]:
# Saving the model to the output directory
if not output_dir.exists():
  output_dir.mkdir()
nlp.meta['name'] = 'my_ner'  # rename model
nlp.to_disk(output_dir)
print("Saved model to folder: ", output_dir)

Saved model to folder:  model


In [18]:
# Loading the model from the directory
print("Loading from folder:", output_dir)
nlp2 = spacy.load(output_dir)

Loading from folder: model


In [19]:
###assert nlp2.get_pipe("ner").move_names == move_names
doc2 = nlp2(' Dosa is an extremely famous south Indian dish')
for ent in doc2.ents:
  print(ent.label_, ent.text)

ORG Dosa
