In [1]:
import en_core_web_lg
import pandas as pd
import re
import random
import spacy
from spacy.util import minibatch, compounding
import warnings
import matplotlib.pyplot as plt
import re
import os
import numpy as np
import json
import os

In [2]:
# Display the installed spaCy version as the cell output.
spacy.__version__

'3.3.0'

The annotations were created manually with an online annotation tool. Data with other entity types can also be used, provided it follows the same annotation format.

In [3]:
# Load the manually annotated examples. The file is read inside a `with`
# block so the handle is closed as soon as parsing finishes, and the encoding
# is stated explicitly so the result does not depend on the platform default.
annotation_file = 'annotations.json'
with open(annotation_file, encoding='utf-8') as f:
    TRAIN_DATA = json.load(f)

In [4]:
# Preview the first five annotated examples. Each example is indexed as
# [sentence text, entity dictionary].
print("Annotations of first 5 sentences \n")
for annotation in TRAIN_DATA["annotations"][:5]:
    sentence, entity_info = annotation[0], annotation[1]
    print("Sentence: ", sentence)
    print("Entities:", entity_info, '\n')

Annotations of first 5 sentences 

Sentence:  this mix makes the most delicious pancakes! my gluten eating family wanted mine instead of theirs! i have also make sausage balls and biscuits for biscuits and gravy with it! so good!
Entities: {'entities': [[34, 42, 'FOOD'], [116, 123, 'FOOD'], [134, 142, 'FOOD'], [147, 155, 'FOOD'], [160, 165, 'FOOD']]} 

Sentence:  lemon juice and fruit concentrates mingle with the sparkling water to create a delicate, smooth, slightly tingly and delicious drink
Entities: {'entities': [[0, 11, 'FOOD'], [16, 21, 'FOOD'], [61, 66, 'FOOD'], [127, 132, 'FOOD']]} 

Sentence:  it only takes two cans of chicken broth to one package of quinoa to make a lot of very tasty and filling finished product which can be used with gravy, sauce, mixed vegetables or anything you like
Entities: {'entities': [[26, 39, 'FOOD'], [58, 64, 'FOOD'], [145, 150, 'FOOD'], [152, 157, 'FOOD'], [165, 175, 'FOOD']]} 

Sentence:  my role is to greet them with a tasty treat - usually a gra

The training data are converted into the `DocBin` format used by spaCy v3.

In [5]:
from spacy.tokens import DocBin
from tqdm import tqdm
import numpy as np

# Fixed seed so the shuffled order, and therefore the serialized file, is the
# same on every Restart & Run All.
SHUFFLE_SEED = 42

nlp = spacy.load('en_core_web_lg')
db = DocBin()

# Shuffling to avoid continuation bias. A copy is shuffled so TRAIN_DATA keeps
# its original order and earlier cells show the same sentences when re-run.
shuffled_annotations = list(TRAIN_DATA['annotations'])
np.random.default_rng(SHUFFLE_SEED).shuffle(shuffled_annotations)

for text, annot in tqdm(shuffled_annotations):
    # make_doc builds the Doc from the raw text; entities are attached below.
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # The character offsets could not be turned into a span; report
            # which annotation was dropped so it can be corrected at the source.
            print("Skipping entity", label, (start, end), repr(text[start:end]))
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

# Serialize every annotated Doc into the binary file read by `spacy train`.
db.to_disk("./training_data.spacy")

100%|██████████| 295/295 [00:00<00:00, 1100.80it/s]


In [6]:
# Reload the stock English pipeline and look at its entity recogniser.
nlp = spacy.load('en_core_web_lg')

# Only the 'ner' component is of interest here.
ner = nlp.get_pipe("ner")

# Print each label the stock recogniser knows next to spaCy's explanation of it.
for label in ner.labels:
    description = spacy.explain(label)
    print(label, description)

CARDINAL Numerals that do not fall under another type
DATE Absolute or relative dates or periods
EVENT Named hurricanes, battles, wars, sports events, etc.
FAC Buildings, airports, highways, bridges, etc.
GPE Countries, cities, states
LANGUAGE Any named language
LAW Named documents made into laws.
LOC Non-GPE locations, mountain ranges, bodies of water
MONEY Monetary values, including unit
NORP Nationalities or religious or political groups
ORDINAL "first", "second", etc.
ORG Companies, agencies, institutions, etc.
PERCENT Percentage, including "%"
PERSON People, including fictional
PRODUCT Objects, vehicles, foods, etc. (not services)
QUANTITY Measurements, as of weight or distance
TIME Times smaller than a day
WORK_OF_ART Titles of books, songs, etc.


Generate the training config file (`config.cfg`) from the command line

In [None]:
# Shell command: write a spaCy training config to config.cfg for English
# (--lang en), requesting an 'ner' pipeline and the "efficiency" preset.
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

# Training

In [7]:
# Shell command: train with config.cfg and save the resulting pipelines in the
# current directory (--output ./).
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# ENTS_F / ENTS_P / ENTS_R values reported during training are computed on the
# training examples themselves and do not measure generalisation. A separate
# held-out dev file would be needed for that.
! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

ℹ Saving to output directory: .
ℹ Using CPU
ℹ To switch to GPU 0, use the option: --gpu-id 0
[1m
✔ Initialized pipeline
[1m
ℹ Pipeline: ['tok2vec', 'ner']
ℹ Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     34.67    1.13    1.15    1.11    0.01
  2     200         60.09   1815.91   90.64   91.41   89.89    0.91
  4     400         88.19    457.87   95.84   95.97   95.71    0.96
  8     600        139.00    253.97   98.74   99.44   98.06    0.99
 12     800        165.13    180.05   98.83   98.35   99.31    0.99
 17    1000        184.10    150.08   99.38   99.45   99.31    0.99
 24    1200        254.02    145.66   99.44  100.00   98.89    0.99
 32    1400        198.06    139.17   99.51   99.72   99.31    1.00
 41    1600        106.56    105.93   99.51   99.72   99.31    1.00
 53    1800        163.98    130.35   99.51   99.58   99.45    1.00

[2022-06-21 02:54:20,585] [INFO] Set up nlp object from config
[2022-06-21 02:54:20,597] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-06-21 02:54:20,602] [INFO] Created vocabulary
[2022-06-21 02:54:20,619] [INFO] Finished initializing nlp object
[2022-06-21 02:54:21,461] [INFO] Initialized pipeline components: ['tok2vec', 'ner']



330    4400        176.83    165.67   99.51   99.86   99.17    1.00
352    4600         68.13    146.18   99.51   99.72   99.31    1.00
374    4800         63.03    137.64   99.52   99.45   99.58    1.00
✔ Saved pipeline to output directory
model-last


### First, the best model from training is loaded

In [15]:
# Load the custom pipeline from the 'model-best' directory (the checkpoint this
# notebook treats as the best one from training) and fetch its 'ner' component.
best_model_directory = 'model-best'
nlp_food = spacy.load(best_model_directory)
ner_food = nlp_food.get_pipe("ner")

## Catastrophic Forgetting
This model can only detect the custom-trained 'FOOD' entity: it was trained only on FOOD annotations, so none of the pretrained entity types are available (the same symptom as catastrophic forgetting). This can be seen from the results on a test file.

In [9]:
# Read one test sentence per line. Iterating over the file keeps the final
# sentence even when the file does not end with a newline (slicing off the last
# split element would drop it), and blank lines are skipped so no empty
# "sentence" is passed to the model.
test_file_name = 'test_sentences.txt'
with open(test_file_name, 'r', encoding='utf-8') as file:
    TEST_TEXTS = [line.rstrip('\n') for line in file if line.strip()]

In [16]:
# Run the custom FOOD pipeline on every test sentence and render the
# recognised entities inline in the notebook.
for text in TEST_TEXTS:
    spacy.displacy.render(nlp_food(text), style="ent", jupyter=True)

It can be observed that the model detects only food entities. To solve this problem, the 'ner' component of the custom-trained model needs to be combined with the 'ner' component of the original model.

In [17]:
# Reload the stock English pipeline, whose 'ner' component provides the
# standard entity labels listed earlier in the notebook.
nlp = spacy.load('en_core_web_lg')
ner = nlp.get_pipe("ner")

# Replace the tok2vec listener inside the FOOD model's 'ner' component before
# copying it. NOTE(review): presumably this gives 'ner' its own tok2vec so it no
# longer depends on nlp_food's shared one after being moved — confirm against
# the spaCy docs.
nlp_food.replace_listeners("tok2vec", "ner", ["model.tok2vec"])
# Copy the FOOD 'ner' component into the stock pipeline under the name
# "food_ner", placed ahead of the stock 'ner' component.
nlp.add_pipe('ner', source = nlp_food , name="food_ner",before = 'ner')



<spacy.pipeline.ner.EntityRecognizer at 0x25489cbfc30>

In [18]:
# Run the combined pipeline (stock components plus "food_ner") on every test
# sentence and render the recognised entities inline in the notebook.
for text in TEST_TEXTS:
    doc = nlp(text)
    spacy.displacy.render(doc, style="ent", jupyter=True)

# Finally, the combined nlp pipeline detects both the custom FOOD entity and the standard entity types

<spacy.lang.en.English at 0x1a01165e260>