In [None]:
# install sp
import sys
!{sys.executable} -m pip install -U spacy

In [1]:
import spacy
import random

In [None]:
nlp = spacy.load('en_core_web_sm')

## Train the spacy entity recognizition model.

In [None]:
table_names = ['employee', 'organization', 'department']
column_names = ['id', 'name', 'email', 'address', 'phone_number', 'date_of_birth']

In [None]:
all_input_vectors = table_names + column_names

In [2]:
train_data = [
    ("get list of employees", {
        'entities': [(12, 20, 'Table')]
    }),
    ("get list of employees with name Nilepta", {
        'entities': [(12, 20, 'Table'), (27, 31, 'Column'), (32, 39, 'Value')]
    }),
    ("search employees with name Nilepta", {
        'entities': [(7, 16, 'Table'), (22, 26, 'Column')]
    }),
    ("find list of employees who live in Washington state", {
        'entities': [(13, 22, 'Table'), (46, 51, 'Column'), (35, 45, 'Value')]
    }),
    ("get details of all employees from state Washington", {
        'entities': [(19, 28, 'Table'), (34, 39, 'Column'), (40, 50, 'Value')]
    }),
    ("find employee with phone number as 2345678901", {
        'entities': [(5, 13, 'Table'), (19, 24, 'Column'), (35, 45, 'Value')]
    }),
    ("get list of employees from Washington state", {
        'entities': [(12, 21, 'Table'), (38, 43, 'Column'), (27, 37, 'Value')]
    }),
    ("find employees whose email is qwerty@gmail.com", {
     'entities': [(5, 14, 'Table'), (21, 26, 'Column'), (30, 46, 'Value')]
    }),
    ("get employee whose email is asdfg@gmail.com", {
        'entities': [(4, 12, 'Table'), (19, 24, 'Column'), (28, 43, 'Value')]
    })
    ]

In [3]:
# nlp create model.
nlp = spacy.blank('en')

In [4]:
ner = nlp.create_pipe('ner')

In [5]:
nlp.add_pipe(ner)

In [6]:
# add labelled data to the model.
for _, annotations in train_data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

In [7]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

In [9]:
n_iter = 20

In [10]:
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        for text, annotations in train_data:
            nlp.update([text], [annotations], sgd=optimizer, drop=0.35, losses=losses)
        print(losses)

{'ner': 48.832067865878344}
{'ner': 35.658218862488866}
{'ner': 31.5055147993844}
{'ner': 24.733343594267353}
{'ner': 16.755979170869978}
{'ner': 10.521632771708283}
{'ner': 5.48107267651709}
{'ner': 2.559848444220565}
{'ner': 2.455654755071797}
{'ner': 3.22536460782777}
{'ner': 2.00545993525101}
{'ner': 2.361594494627145}
{'ner': 1.9842200670356336}
{'ner': 3.1655945609672838}
{'ner': 2.532924808787292}
{'ner': 2.0033856140849204}
{'ner': 3.994853970337278}
{'ner': 2.142054011204241}
{'ner': 1.6448378174446052}
{'ner': 2.967424586238052}


In [11]:
# testing the model.
test_text = "get employee with email as rohan@gmail.com"
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for i, ent in enumerate(doc.ents):
    print("Entity number %s is %s in text: '%s'" % (i, ent.label_, ent.text))
    print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Entities in 'get employee with email as rohan@gmail.com'
Entity number 0 is Table in text: 'employee'
Tokens [('get', '', 2), ('employee', 'Table', 3), ('with', '', 2), ('email', 'Column', 3), ('as', '', 2), ('rohan@gmail.com', 'Value', 3)]
Entity number 1 is Column in text: 'email'
Tokens [('get', '', 2), ('employee', 'Table', 3), ('with', '', 2), ('email', 'Column', 3), ('as', '', 2), ('rohan@gmail.com', 'Value', 3)]
Entity number 2 is Value in text: 'rohan@gmail.com'
Tokens [('get', '', 2), ('employee', 'Table', 3), ('with', '', 2), ('email', 'Column', 3), ('as', '', 2), ('rohan@gmail.com', 'Value', 3)]


In [12]:
nlp.to_disk('/mnt/azmnt/code/Users/Congitveservice_NoSQL/NLP/Model')