> # Train Custom NER with Spacy 3.0

## Installing and Loading necessary Libraries

In [None]:
! pip install spacy==3.0.6

In [None]:
!python -m spacy download en_core_web_sm

In [20]:
import spacy

## Creation of Training Data

Here few texts are taken from news/review articles about aircrafts

In [21]:
TRAIN_DATA = [('The F15 aircraft uses a lot of fuel', {'entities': [(4, 7, 'aircraft')]}),
 ('did you see the F16 landing?', {'entities': [(16, 19, 'aircraft')]}),
 ('how many missiles can a F35 carry', {'entities': [(24, 27, 'aircraft')]}),
 ('is the F15 outdated', {'entities': [(7, 10, 'aircraft')]}),
 ('does the US still train pilots to dog fight?',
  {'entities': [(0, 0, 'aircraft')]}),
 ('how long does it take to train a F16 pilot',
  {'entities': [(33, 36, 'aircraft')]}),
 ('how much does a F35 cost', {'entities': [(16, 19, 'aircraft')]}),
 ('would it be possible to steal a F15', {'entities': [(32, 35, 'aircraft')]}),
 ('who manufactures the F16', {'entities': [(21, 24, 'aircraft')]}),
 ('how many countries have bought the F35',
  {'entities': [(35, 38, 'aircraft')]}),
 ('is the F35 a waste of money', {'entities': [(7, 10, 'aircraft')]})]

In [22]:
import pandas as pd
import os
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin

#nlp = spacy.blank("en") # load a new spacy model
nlp = spacy.load("en_core_web_sm") # load other spacy model

## Converting training data to spacy format

In [23]:


db = DocBin() # create a DocBin object

for text, annot in tqdm(TRAIN_DATA): # data in previous format
    doc = nlp.make_doc(text) # create doc object from text
    ents = []
    for start, end, label in annot["entities"]: # add character indexes
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents # label the text with the ents
    db.add(doc)

os.chdir(r'/content')
db.to_disk("./train.spacy") # save the docbin object

100%|██████████| 11/11 [00:00<00:00, 974.66it/s]

Skipping entity





## Creating the Config file for training the model

https://spacy.io/usage/training#config

Download the config file by choosing ner as component from the above link.

File name will be base_config.cfg

## Filling the Config file

"After you’ve saved the starter config to a file base_config.cfg, you can use the init fill-config command to fill in the remaining defaults. 

Training configs should always be complete and without hidden defaults, to keep your experiments reproducible." As gievn in Spacy doc



In [24]:
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


## Run the Training configuration

In [27]:
#last parameter is for test data, but here both train and test data is kept same
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy 

[38;5;4mℹ Using CPU[0m
[1m
[2022-02-04 05:40:08,816] [INFO] Set up nlp object from config
[2022-02-04 05:40:08,827] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-02-04 05:40:08,833] [INFO] Created vocabulary
[2022-02-04 05:40:08,833] [INFO] Finished initializing nlp object
[2022-02-04 05:40:09,011] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     41.50    0.00    0.00    0.00    0.00
200     200          1.10    476.85  100.00  100.00  100.00    1.00
400     400          0.00      0.00  100.00  100.00  100.00    1.00
600     600          0.00      0.00  100.00  100.00  100.00    1.00
800     800          0.00      0.00  100.00  100.00  100.00    1.00
1000    1000          0.00      0.00  10

In [33]:
nlp1 = spacy.load("./output/model-best") #load the best model
doc = nlp1("there was a flight named D16") # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

In [34]:
doc1 = nlp1('The F15 aircraft uses a lot of fuel')
for ent in doc.ents:
    print(ent.label_)

aircraft
