In [1]:
# Pranay Agrawal
# Custom Named Entity Recognition Model in spacy

In [2]:
# List of training data. More data needs to be appended in the same formate as below.
train = [
         ("On January 26, 2020, about 0946 Pacific standard time, a Sikorsky S-76B helicopter, N72EX, entered a rapidly descending left turn and crashed into terrain in Calabasas, California", {"entities": [(3,18,"DATE"), (27,52,"TIME"), (57,88,"EQUIPMENT"), (101,153,"TYPE_ACCIDENT"),(158,178,"LOCATION")]}),
         ("The pilot and eight passengers died, and the helicopter was destroyed", {"entities": [(4,8,"AGENT"),(45,54,"EQUIPMENT"),(60,68,"EQUIPMENT_STATE")]}),
         ("The on-demand flight was operated by Island Express Helicopters Inc. (Island Express), Long Beach, California, under visual flight rules and the provisions of Title 14 Code of Federal Regulations Part 135", {"entities": [(37,67,"OPERATOR"),(159,203,"DOCUMENTATION")]}),
]

In [3]:
import spacy

In [4]:
# Creating my own nlp model to train the custom NER so as to avoid the 'Catastrophing Forgetting' of the pre-built model
nlp = spacy.blank('en')

In [5]:
# Check the already present pipes in the model (here it will be null since I built a blank model)
nlp.pipe_names

[]

In [6]:
# Add a NER pipe in the model
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

In [7]:
# Adding my domain related annotations (label) in the ner pipeline
for text, annotations in train:
  for ent in annotations.get("entities"):
    ner.add_label(ent[2])

In [8]:
# Store pipelines other than 'ner' so as to ignore them during the training process (general case)
disable_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

In [9]:
# Code for training process

from pathlib import Path
import random
from spacy.util import minibatch, compounding

with nlp.disable_pipes(*disable_pipes):
  optimizer = nlp.begin_training()

  for iteration in range(100):

    random.shuffle(train)
    losses = {}
    
    ## Divide training data in batches if there is a lot of training data with following 2 lines
    # batches = minibatch(train, size = compounding(1.0,4.0,1.001))
    # for batch in batches:

    for data in train:
      text, annotation = zip(data)
      nlp.update(text, annotation, drop = 0.5, losses = losses, sgd = optimizer)
      print("Lossess: ", losses)

Lossess:  {'ner': 9.442107206210494}
Lossess:  {'ner': 15.696365820243955}
Lossess:  {'ner': 40.59151098318398}
Lossess:  {'ner': 8.256133118644357}
Lossess:  {'ner': 33.04715936630964}
Lossess:  {'ner': 41.856491120532155}
Lossess:  {'ner': 8.43707790132612}
Lossess:  {'ner': 31.913808583281934}
Lossess:  {'ner': 40.202425139024854}
Lossess:  {'ner': 23.123097280040383}
Lossess:  {'ner': 30.899084135890007}
Lossess:  {'ner': 37.31248202268034}
Lossess:  {'ner': 16.80764506245032}
Lossess:  {'ner': 22.652341899927706}
Lossess:  {'ner': 27.359437748324126}
Lossess:  {'ner': 5.523990992456675}
Lossess:  {'ner': 14.459380786400288}
Lossess:  {'ner': 17.80390716297552}
Lossess:  {'ner': 2.1990169469208922}
Lossess:  {'ner': 2.360724396952719}
Lossess:  {'ner': 3.060508625501825}
Lossess:  {'ner': 0.5175591639890627}
Lossess:  {'ner': 0.623228963215297}
Lossess:  {'ner': 0.6956192496789129}
Lossess:  {'ner': 0.018126214200067103}
Lossess:  {'ner': 0.01813416402630784}
Lossess:  {'ner': 0.01

In [10]:
for text, _ in train:
  doc = nlp(text)
  print('Entities', [(ent.text, ent.label_) for ent in doc.ents])

Entities []
Entities []
Entities []


Increase the training data so that the model can look something and learn