## Load Packages

In [4]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm 

In [6]:
nlp1 = spacy.load('en')

## Working of NER

In [7]:
docx1 = nlp1(u"Who is Rakshmitha?")

In [8]:
for token in docx1.ents:
    print(token.text,token.start_char, token.end_char,token.label_)

Rakshmitha 7 17 PERSON


In [9]:
docx2 = nlp1(u"Who is Rajinikanth?")

In [10]:
for token in docx2.ents:
    print(token.text,token.start_char, token.end_char,token.label_)

Rajinikanth 7 18 ORG


## Train Data

In [11]:
TRAIN_DATA = [
    ('Who is Rakshmitha?', {
        'entities': [(7, 15, 'PERSON')]
    }),
     ('Who is Rajnikanth?', {
        'entities': [(7, 19, 'PERSON')]
    }),
    ('I like London and Berlin.', {
        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
    })
]

## Define our variables

In [12]:
model = None
output_dir=Path("C:\\Users\\raksh\\Documents\\ner")
n_iter=100

## Load the model

In [13]:
if model is not None:
    nlp = spacy.load(model)  
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  
    print("Created blank 'en' model")

Created blank 'en' model


## Set up the pipeline

In [14]:
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe('ner')

## Train the Recognizer

In [15]:
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
            nlp.update(
                [text],  
                [annotations],  
                drop=0.5,  
                sgd=optimizer,
                losses=losses)
        print(losses)

100%|██████████| 3/3 [00:00<00:00, 26.17it/s]
100%|██████████| 3/3 [00:00<00:00, 36.46it/s]
100%|██████████| 3/3 [00:00<00:00, 36.41it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 10.14213179051876}
{'ner': 9.497877180576324}
{'ner': 7.804744482040405}


100%|██████████| 3/3 [00:00<00:00, 35.97it/s]
100%|██████████| 3/3 [00:00<00:00, 36.58it/s]
100%|██████████| 3/3 [00:00<00:00, 38.81it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 6.053979707881808}
{'ner': 5.12600477039814}
{'ner': 4.172462904360145}


100%|██████████| 3/3 [00:00<00:00, 36.40it/s]
100%|██████████| 3/3 [00:00<00:00, 35.42it/s]
100%|██████████| 3/3 [00:00<00:00, 37.57it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 4.1427093129605055}
{'ner': 3.386671197376927}
{'ner': 3.3671334834657216}


100%|██████████| 3/3 [00:00<00:00, 36.45it/s]
100%|██████████| 3/3 [00:00<00:00, 36.16it/s]
100%|██████████| 3/3 [00:00<00:00, 31.00it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 3.6120547978935282}
{'ner': 3.9423165889285228}
{'ner': 3.7018950657569496}


100%|██████████| 3/3 [00:00<00:00, 33.30it/s]
100%|██████████| 3/3 [00:00<00:00, 36.47it/s]
100%|██████████| 3/3 [00:00<00:00, 38.26it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 3.451955220411606}
{'ner': 2.370321923691745}
{'ner': 2.8217329911670808}


100%|██████████| 3/3 [00:00<00:00, 34.82it/s]
100%|██████████| 3/3 [00:00<00:00, 34.34it/s]
100%|██████████| 3/3 [00:00<00:00, 36.68it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 2.9808369937320456}
{'ner': 2.0244186372233073}
{'ner': 2.4139918072662896}


100%|██████████| 3/3 [00:00<00:00, 35.03it/s]
100%|██████████| 3/3 [00:00<00:00, 35.35it/s]
100%|██████████| 3/3 [00:00<00:00, 35.07it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 2.922444817817084}
{'ner': 1.79516746071258}
{'ner': 2.706106535798125}


100%|██████████| 3/3 [00:00<00:00, 35.35it/s]
100%|██████████| 3/3 [00:00<00:00, 36.06it/s]
100%|██████████| 3/3 [00:00<00:00, 35.40it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 2.210332112019598}
{'ner': 1.1440190909415493}
{'ner': 0.7827467715011796}


100%|██████████| 3/3 [00:00<00:00, 33.53it/s]
100%|██████████| 3/3 [00:00<00:00, 36.19it/s]
100%|██████████| 3/3 [00:00<00:00, 36.76it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.49364996448897}
{'ner': 0.6022216455186762}
{'ner': 3.6576377255895722}


100%|██████████| 3/3 [00:00<00:00, 34.59it/s]
100%|██████████| 3/3 [00:00<00:00, 35.47it/s]
100%|██████████| 3/3 [00:00<00:00, 34.69it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.21988466736041443}
{'ner': 0.08440591531707797}
{'ner': 1.0148854647144467}


100%|██████████| 3/3 [00:00<00:00, 34.51it/s]
100%|██████████| 3/3 [00:00<00:00, 36.44it/s]
100%|██████████| 3/3 [00:00<00:00, 36.34it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.23102566462960394}
{'ner': 0.12342620222605795}
{'ner': 0.8638909000757375}


100%|██████████| 3/3 [00:00<00:00, 34.06it/s]
100%|██████████| 3/3 [00:00<00:00, 34.74it/s]
100%|██████████| 3/3 [00:00<00:00, 36.12it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.00970919588207899}
{'ner': 0.0017846464690496136}
{'ner': 0.01109981507475518}


100%|██████████| 3/3 [00:00<00:00, 34.47it/s]
100%|██████████| 3/3 [00:00<00:00, 33.87it/s]
100%|██████████| 3/3 [00:00<00:00, 33.22it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.02483572233721505}
{'ner': 0.0006503501651453534}
{'ner': 2.8185317859143254e-07}


100%|██████████| 3/3 [00:00<00:00, 31.79it/s]
100%|██████████| 3/3 [00:00<00:00, 34.58it/s]
100%|██████████| 3/3 [00:00<00:00, 33.35it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.00018591493454830874}
{'ner': 6.75457447096218e-05}
{'ner': 0.13535210047250804}


100%|██████████| 3/3 [00:00<00:00, 33.21it/s]
100%|██████████| 3/3 [00:00<00:00, 35.56it/s]
100%|██████████| 3/3 [00:00<00:00, 36.52it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.711327687123691}
{'ner': 9.74923960838884e-08}
{'ner': 1.1549613460983596e-08}


100%|██████████| 3/3 [00:00<00:00, 32.58it/s]
100%|██████████| 3/3 [00:00<00:00, 35.44it/s]
100%|██████████| 3/3 [00:00<00:00, 35.73it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 2.9018367348373126e-07}
{'ner': 0.00010916395553819299}
{'ner': 1.9965264000667687e-05}


100%|██████████| 3/3 [00:00<00:00, 33.67it/s]
100%|██████████| 3/3 [00:00<00:00, 35.45it/s]
100%|██████████| 3/3 [00:00<00:00, 34.26it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.8725290268908607e-06}
{'ner': 2.570537320218373e-09}
{'ner': 9.966500500797535e-05}


100%|██████████| 3/3 [00:00<00:00, 35.03it/s]
100%|██████████| 3/3 [00:00<00:00, 36.33it/s]
100%|██████████| 3/3 [00:00<00:00, 37.94it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 2.716567355455616e-07}
{'ner': 8.253754915913999e-10}
{'ner': 6.755019236111631e-07}


100%|██████████| 3/3 [00:00<00:00, 35.51it/s]
100%|██████████| 3/3 [00:00<00:00, 36.83it/s]
100%|██████████| 3/3 [00:00<00:00, 36.67it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.3169963340873584e-07}
{'ner': 3.4339492723877074e-07}
{'ner': 5.995742131804738e-09}


100%|██████████| 3/3 [00:00<00:00, 33.58it/s]
100%|██████████| 3/3 [00:00<00:00, 36.42it/s]
100%|██████████| 3/3 [00:00<00:00, 36.13it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.6431998670401708e-10}
{'ner': 4.3056865282157665e-07}
{'ner': 1.1900354064030299e-06}


100%|██████████| 3/3 [00:00<00:00, 36.05it/s]
100%|██████████| 3/3 [00:00<00:00, 34.73it/s]
100%|██████████| 3/3 [00:00<00:00, 36.91it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.870560567713642e-08}
{'ner': 3.483247410450605e-08}
{'ner': 0.0001184918002238426}


100%|██████████| 3/3 [00:00<00:00, 36.66it/s]
100%|██████████| 3/3 [00:00<00:00, 37.60it/s]
100%|██████████| 3/3 [00:00<00:00, 34.26it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 5.91824114368283e-09}
{'ner': 1.7046211408351543e-06}
{'ner': 0.0007408987190375411}


100%|██████████| 3/3 [00:00<00:00, 35.10it/s]
100%|██████████| 3/3 [00:00<00:00, 35.27it/s]
100%|██████████| 3/3 [00:00<00:00, 36.31it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.1894515468878877e-10}
{'ner': 1.0461581067088373e-05}
{'ner': 3.003879101255458e-08}


100%|██████████| 3/3 [00:00<00:00, 35.70it/s]
100%|██████████| 3/3 [00:00<00:00, 36.57it/s]
100%|██████████| 3/3 [00:00<00:00, 36.26it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 4.0105240545408065e-07}
{'ner': 7.851957419355631e-11}
{'ner': 1.2310734608609853e-06}


100%|██████████| 3/3 [00:00<00:00, 37.39it/s]
100%|██████████| 3/3 [00:00<00:00, 34.64it/s]
100%|██████████| 3/3 [00:00<00:00, 37.03it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.4532352391370107e-09}
{'ner': 4.623981416138481e-09}
{'ner': 5.239299132389046e-07}


100%|██████████| 3/3 [00:00<00:00, 34.51it/s]
100%|██████████| 3/3 [00:00<00:00, 37.27it/s]
100%|██████████| 3/3 [00:00<00:00, 35.54it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 2.280408580093279e-08}
{'ner': 1.4023520900560863e-08}
{'ner': 7.336606967255153e-10}


100%|██████████| 3/3 [00:00<00:00, 35.96it/s]
100%|██████████| 3/3 [00:00<00:00, 33.93it/s]
100%|██████████| 3/3 [00:00<00:00, 34.53it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.0028724787187983662}
{'ner': 9.818127472804709e-05}
{'ner': 1.291662536959284e-09}


100%|██████████| 3/3 [00:00<00:00, 34.04it/s]
100%|██████████| 3/3 [00:00<00:00, 36.77it/s]
100%|██████████| 3/3 [00:00<00:00, 37.53it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.00023248122180163355}
{'ner': 9.991508021499316e-12}
{'ner': 1.0875775549535956e-07}


100%|██████████| 3/3 [00:00<00:00, 34.34it/s]
100%|██████████| 3/3 [00:00<00:00, 35.47it/s]
100%|██████████| 3/3 [00:00<00:00, 35.69it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 3.1062144418809344e-09}
{'ner': 3.7987394057393594e-07}
{'ner': 4.341617498126951e-10}


100%|██████████| 3/3 [00:00<00:00, 36.20it/s]
100%|██████████| 3/3 [00:00<00:00, 34.36it/s]
100%|██████████| 3/3 [00:00<00:00, 35.47it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 3.685676296656774e-09}
{'ner': 4.2477060624348625e-11}
{'ner': 1.555314982763765e-08}


100%|██████████| 3/3 [00:00<00:00, 34.41it/s]
100%|██████████| 3/3 [00:00<00:00, 35.72it/s]
100%|██████████| 3/3 [00:00<00:00, 36.44it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 3.714793410929864e-10}
{'ner': 5.287610558482392e-07}
{'ner': 2.332234153795826e-09}


100%|██████████| 3/3 [00:00<00:00, 34.61it/s]
100%|██████████| 3/3 [00:00<00:00, 36.63it/s]
100%|██████████| 3/3 [00:00<00:00, 37.00it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.5618143045412516e-05}
{'ner': 8.752450626551187e-10}
{'ner': 2.732600978673577e-07}


100%|██████████| 3/3 [00:00<00:00, 32.53it/s]
100%|██████████| 3/3 [00:00<00:00, 36.72it/s]
100%|██████████| 3/3 [00:00<00:00, 36.06it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 4.529118880064664e-10}
{'ner': 2.0142931464943953e-10}
{'ner': 6.936708089143395e-05}


100%|██████████| 3/3 [00:00<00:00, 34.53it/s]

{'ner': 7.402782809313387e-10}





## Test the trained model

In [16]:
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Entities [('London', 'LOC'), ('Berlin', 'LOC')]
Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]
Entities []
Tokens [('Who', '', 2), ('is', '', 2), ('Rajnikanth', '', 2), ('?', '', 2)]
Entities []
Tokens [('Who', '', 2), ('is', '', 2), ('Rakshmitha', '', 2), ('?', '', 2)]


## Save the model

In [17]:
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)        

Saved model to C:\Users\raksh\Documents\ner


## Test the saved model

In [18]:
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
for text, _ in TRAIN_DATA:
    doc = nlp2(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Loading from C:\Users\raksh\Documents\ner
Entities [('London', 'LOC'), ('Berlin', 'LOC')]
Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]
Entities []
Tokens [('Who', '', 2), ('is', '', 2), ('Rajnikanth', '', 2), ('?', '', 2)]
Entities []
Tokens [('Who', '', 2), ('is', '', 2), ('Rakshmitha', '', 2), ('?', '', 2)]
