##### Named Entity Recognition using SpaCy 
An Example of NER for a custom dataset found in <a href = "https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus">here</a> 

References used:
https://towardsdatascience.com/custom-named-entity-recognition-using-spacy-7140ebbb3718 <br>
https://medium.com/@nikita25.pardesi/formatting-spacy-custom-training-data-the-easier-way-3aa4f35f6112<br>
https://spacy.io/usage/training#basics<br>
https://medium.com/@manivannan_data/how-to-train-ner-with-custom-training-data-using-spacy-188e0e508c6<br>

In [1]:
#Import packages
import spacy
import pandas as pd
import numpy as np

import csv
import json

import random
import re
from tqdm import tqdm

In [2]:
#Read dataset
ner_dataset = pd.read_csv("ner_dataset.csv", encoding = "cp1252")

In [3]:
#View dataset
ner_dataset.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [4]:
#Drop the columns that aren't necessary
ner_dataset = ner_dataset.drop(labels = ['Sentence #', 'POS'],
                               axis = 1)

In [5]:
#Converting to spacy training format
#Example : ("Uber blew through $1 million a week", [(0, 4, 'ORG')])
#The start and end position of the word and the label for the entity is given in a tuple.

l1 = []
l2 = []

for i in tqdm(range(len(ner_dataset['Word']))):
    l1.append(ner_dataset['Word'][i])
    l2.append({'entities' : [(0, len(ner_dataset['Word'][i]), ner_dataset["Tag"][i])]})
TRAIN_DATA = list(zip(l1, l2))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1048575/1048575 [01:14<00:00, 14042.75it/s]


In [6]:
len(TRAIN_DATA)

1048575

In [7]:
#Defining a function to train a blank SpaCy model
def train_spacy(data,iterations):
    #Assign the training data
    TRAIN_DATA = data
    
    #Initialize a blank SpaCy model
    model = spacy.blank('en')  
    print("Model created")
    
    #Add NER to the model's pipeline, if it's not present already.
    if 'ner' not in model.pipe_names:
        ner = model.create_pipe('ner')
        model.add_pipe(ner, last=True)
    print("NER added to pipeline")
    
    #Adding the labels to the model
    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])
    print("Labels added")
    
    #Disabling other functions in the pipepline while training the model
    other_pipes = [pipe for pipe in model.pipe_names if pipe != 'ner']
    with model.disable_pipes(*other_pipes): 
        #Begin training
        print("Other functions disabled")
        optimizer = model.begin_training()
        print("Training")
        for itn in range(iterations):
            print("Statring iteration " + str(itn))
            #Train the models in a random order
            random.shuffle(TRAIN_DATA)
#             i = 0
            losses = {}
    
            #Update the model for each entity
            for n in tqdm(range(len(TRAIN_DATA))):
                text = TRAIN_DATA[n][0]
                annotations = TRAIN_DATA[n][1]
#                 print(i)
#                 i = i + 1
                model.update(
                    [text],
                    [annotations],
                    drop=0.2,
                    sgd=optimizer,
                    losses=losses)
#                 print("Done :", text)
            print(losses)
        print("Training complete")
    return model

In [None]:
#Train a model
ner_model = train_spacy(TRAIN_DATA, 5)

Model created
NER added to pipeline
Labels added
Other functions disabled
Training
Statring iteration 0


 56%|█████████████████████████████████████████████████████████████████████████████████████████                                                                     | 590782/1048575 [15:19:44<10:15:57, 12.39it/s]

In [None]:
ner_model.meta['name'] = "NER_MODEL_1" 
ner_model.to_disk("ner_model_1")
print("Saved model")

In [None]:
test_text = "Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country."

In [None]:
doc = ner_model(test_text)

In [None]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)