<a href="https://colab.research.google.com/github/siddheshdosi/Name_Entity_Recognization/blob/main/Name_Entity_Recognization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import spacy
import random

In [4]:
# Load the packaged 'en_core_web_sm' pipeline and show which components it
# contains (the bare last expression is rendered as the cell output).
MODEL_NAME = 'en_core_web_sm'
nlp = spacy.load(MODEL_NAME)
nlp.pipe_names

['tagger', 'parser', 'ner']

In [7]:
# Run the pretrained pipeline on a sample sentence and list every entity it
# finds as: text, start character offset, end character offset, label.
sample_sentence = 'India is the second largest population country after China'
doc = nlp(sample_sentence)
for ent in doc.ents:
    entity_fields = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*entity_fields)

India 0 5 GPE
second 13 19 ORDINAL
China 53 58 GPE


In [8]:
# Second sample: a sentence containing a person name and an organisation name.
# The sentence text is kept exactly as written because the printed character
# offsets depend on it.
sample_sentence = 'I am Siddhesh Dosi and curently I am working in Ascena Retail Group as data scientist'
doc = nlp(sample_sentence)
for ent in doc.ents:
    entity_fields = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*entity_fields)

Siddhesh Dosi 5 18 PERSON
Ascena Retail Group 48 67 ORG


# Create Custom NER

In [9]:
# Every training sentence has the same shape: a fixed question prefix, one
# product name, and a trailing '?'. The entity span is therefore derived from
# the prefix length and the product-name length instead of being typed by hand.
QUESTION_PREFIX = 'what is the price of '
QUESTION_SUFFIX = '?'
ENTITY_LABEL = 'PrdName'

# Product names, in training order. Spellings are kept exactly as in the
# original data set because they are part of the training text.
PRODUCT_NAMES = [
    'polo', 'ball', 'jegging', 't-shirt', 'jeans',
    'bat', 'shirt', 'bag', 'cup', 'jug',
    'plate', 'glass', 'moniter', 'desktop', 'bottle',
    'mouse', 'keyboad', 'chair', 'table', 'watch',
]

# Each item is (text, {'entities': [(start_char, end_char, label)]}).
TRAIN_DATA = [
    (
        QUESTION_PREFIX + product + QUESTION_SUFFIX,
        {'entities': [(len(QUESTION_PREFIX),
                       len(QUESTION_PREFIX) + len(product),
                       ENTITY_LABEL)]},
    )
    for product in PRODUCT_NAMES
]

In [14]:
def train_spacy(data, iterations):
    """Train a blank English spaCy pipeline that contains only an NER component.

    Parameters
    ----------
    data : iterable of (text, annotations) pairs
        ``annotations`` is a dict whose ``'entities'`` entry is a list of
        ``(start_char, end_char, label)`` tuples. The caller's object is
        never reordered or otherwise modified.
    iterations : int
        Number of full passes over the training examples.

    Returns
    -------
    The trained ``nlp`` object created by ``spacy.blank('en')``.

    Side effects
    ------------
    Prints the iteration number and the accumulated losses dict once per pass.

    NOTE(review): ``create_pipe`` / ``begin_training`` / ``update(texts,
    annotations, ...)`` look like the spaCy 2.x training API - confirm the
    installed spaCy version before running this on a fresh environment.
    """
    # Work on a private copy: the per-iteration shuffle below must not reorder
    # the list the caller passed in (the notebook passes the global TRAIN_DATA).
    train_examples = list(data)

    nlp = spacy.blank('en')  # create blank Language class

    # Make sure the pipeline has an NER component and that `ner` is bound on
    # both branches, so adding labels below never hits an undefined name.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # Register every label that occurs in the training annotations.
    # `or ()` tolerates annotations without an 'entities' entry.
    for _, annotations in train_examples:
        for ent in annotations.get('entities') or ():
            ner.add_label(ent[2])

    # Disable every other pipe so that only NER is updated during training.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(train_examples)  # new example order on every pass
            losses = {}
            for text, annotations in train_examples:
                nlp.update(
                    [text],  # batch of texts (one example per update)
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp
  
# Fit the product-name recogniser: 20 full passes over TRAIN_DATA.
prdnlp = train_spacy(data=TRAIN_DATA, iterations=20)

Starting iteration 0
{'ner': 49.08064279760765}
Starting iteration 1
{'ner': 2.623141813894904}
Starting iteration 2
{'ner': 2.54797519045618}
Starting iteration 3
{'ner': 1.9891381357630273}
Starting iteration 4
{'ner': 0.8058330164217723}
Starting iteration 5
{'ner': 1.008574696641288}
Starting iteration 6
{'ner': 5.293108504865859}
Starting iteration 7
{'ner': 2.8336186319740997}
Starting iteration 8
{'ner': 2.1055557291492693}
Starting iteration 9
{'ner': 0.9575473366716688}
Starting iteration 10
{'ner': 3.576508227817468}
Starting iteration 11
{'ner': 1.4557244176555884}
Starting iteration 12
{'ner': 3.484738441269903}
Starting iteration 13
{'ner': 3.3905825884209824}
Starting iteration 14
{'ner': 0.9535278853753238}
Starting iteration 15
{'ner': 1.4501121944136688}
Starting iteration 16
{'ner': 5.312898688215141}
Starting iteration 17
{'ner': 2.294165147634266}
Starting iteration 18
{'ner': 1.5991253485074346}
Starting iteration 19
{'ner': 0.04940585291852661}


In [16]:
# Try the trained model on a question about "pen", a product name that does
# not appear in TRAIN_DATA, and list each entity it finds as:
# text, start character offset, end character offset, label.
test_text = 'what is the price of pen'
doc = prdnlp(test_text)
for ent in doc.ents:
    entity_fields = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*entity_fields)

pen 21 24 PrdName
