In [None]:
# install sp
import sys
!{sys.executable} -m pip install -U spacy

In [21]:
import spacy
import random

In [2]:
nlp = spacy.load('en_core_web_sm')

## Train the spacy entity recognizition model.

In [3]:
table_names = ['employee', 'organization', 'department']
column_names = ['id', 'name', 'email', 'address', 'phone_number', 'date_of_birth']

In [4]:
all_input_vectors = table_names + column_names

In [16]:
labels = ['table', 'column']

train_data = [
    ("get list of employees", {
        'entities': [(12, 20, 'Table')]
    }),
    ("get list of employees with name Nilepta", {
        'entities': [(12, 20, 'Table'), (27, 31, 'Column'), (32, 39, 'Value')]
    }),
    ("search employees with name Nilepta", {
        'entities': [(7, 16, 'Table'), (22, 26, 'Column')]
    }),
    ("find list of employees who live in Washington state", {
        'entities': [(13, 22, 'Table'), (46, 51, 'Column'), (35, 45, 'Value')]
    }),
    ("get details of all employees from state Washington", {
        'entities': [(19, 28, 'Table'), (34, 39, 'Column'), (40, 50, 'Value')]
    }),
    ("find employee with phone number as 2345678901", {
        'entities': [(5, 13, 'Table'), (19, 24, 'Column'), (35, 45, 'Value')]
    }),
    ("get list of employees from Washington state", {
        'entities': [(12, 21, 'Table'), (38, 43, 'Column'), (27, 37, 'Value')]
    }),
    ("find employees whose email is qwerty@gmail.com", {
     'entities': [(5, 14, 'Table'), (21, 26, 'Column'), (30, 46, 'Value')]
    }),
    ("get employee whose email is asdfg@gmail.com", {
        'entities': [(4, 12, 'Table'), (19, 24, 'Column'), (28, 43, 'Value')]
    })
    ]

In [11]:
# nlp create model.
nlp = spacy.blank('en')

In [12]:
ner = nlp.create_pipe('ner')

In [13]:
nlp.add_pipe(ner)

In [17]:
# add labelled data to the model.
for _, annotations in train_data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

In [18]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

In [19]:
n_iter = 10

In [22]:
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        for text, annotations in train_data:
            nlp.update([text], [annotations], sgd=optimizer, drop=0.35, losses=losses)
        print(losses)

{'ner': 49.35320996120572}
{'ner': 31.912368553254055}
{'ner': 30.86327546251316}
{'ner': 24.400518887433122}
{'ner': 20.008653647564884}
{'ner': 11.515644887920672}
{'ner': 7.235697801928495}
{'ner': 3.28906892614385}
{'ner': 2.7888558081895094}
{'ner': 2.342875523543452}


In [25]:
# testing the model.
test_text = "get employee with email as rohan@gmail.com"
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for i, ent in enumerate(doc.ents):
    print("Entity number %s is %s in text: '%s'" % (i, ent.label_, ent.text))

Entities in 'get employee with email as rohan@gmail.com'
Entity number 0 is Table in text: 'employee'
Entity number 1 is Column in text: 'email'
Entity number 2 is Value in text: 'rohan@gmail.com'


In [None]:
from spacy.vocab import Vocab
vocab = Vocab(strings=all_input_vectors)

In [None]:
from spacy.pipeline import EntityRecognizer

ner = EntityRecognizer(vocab)
nlp.add_pipe(ner)

In [None]:
optimizer = nlp.begin_training()

In [7]:
"search employees with name Nilepta".index("name")

22

In [8]:
len("Nilepta")

7

In [6]:
len("employees")

9

In [9]:
len("name")

4