In [None]:
# install sp
import sys
!{sys.executable} -m pip install -U spacy

In [1]:
import spacy
import random

In [2]:
# Load the 'en_core_web_sm' pipeline.
# NOTE(review): `nlp` is rebound to spacy.blank('en') further down, so the
# pipeline loaded here is not the one that gets trained — confirm it is needed.
nlp = spacy.load('en_core_web_sm')

## Train the spaCy entity recognition model.

In [3]:
# Table and column names; concatenated into `all_input_vectors` in the next cell.
# (Presumably the schema the natural-language queries are aimed at.)
table_names = [
    'employee',
    'organization',
    'department',
]
column_names = [
    'id',
    'name',
    'email',
    'address',
    'phone_number',
    'date_of_birth',
]

In [4]:
# Single flat list: all table names first, followed by all column names.
all_input_vectors = [*table_names, *column_names]

In [5]:
# NOTE(review): `labels` is not referenced by any cell visible in this notebook,
# and its lower-case names differ from the labels used in `train_data` below
# ('Table', 'Column', 'Value', 'Operator') — confirm whether it is still needed.
labels = ['table', 'column']

# Training examples for the NER component.
# Each item is (text, {'entities': [(start_char, end_char, label), ...]}) where
# `end_char` is exclusive, i.e. text[start_char:end_char] is the entity text.
# Spans must coincide with whole words and must not overlap: a span that cuts
# through a token cannot be aligned to spaCy's tokenisation, and overlapping
# spans give one token two conflicting labels.
# NOTE(review): a few sentences leave "and" (and, in the "search employees"
# sentence, the value "Nilepta") unlabelled although the same phrasing is
# labelled in other sentences — confirm whether that is intentional.
train_data = [
    ("get list of employees", {
        'entities': [(12, 21, 'Table')]
    }),
    ("get list of employees with name Nilepta", {
        'entities': [(12, 21, 'Table'), (27, 31, 'Column'), (32, 39, 'Value')]
    }),
    ("search employees with name Nilepta", {
        'entities': [(7, 16, 'Table'), (22, 26, 'Column')]
    }),
    ("find list of employees who live in Washington state", {
        'entities': [(13, 22, 'Table'), (46, 51, 'Column'), (35, 45, 'Value')]
    }),
    ("get details of all employees from state Washington", {
        'entities': [(19, 28, 'Table'), (34, 39, 'Column'), (40, 50, 'Value')]
    }),
    ("find employee with phone number as 2345678901", {
        # Column covers the full phrase "phone number", matching the
        # "... and phone number 5550147" example further down.
        'entities': [(5, 13, 'Table'), (19, 31, 'Column'), (35, 45, 'Value')]
    }),
    ("get list of employees from Washington state", {
        'entities': [(12, 21, 'Table'), (38, 43, 'Column'), (27, 37, 'Value')]
    }),
    ("find employees whose email is qwerty@gmail.com", {
        'entities': [(5, 14, 'Table'), (21, 26, 'Column'), (30, 46, 'Value')]
    }),
    ("get employee whose email is asdfg@gmail.com", {
        'entities': [(4, 12, 'Table'), (19, 24, 'Column'), (28, 43, 'Value')]
    }),
    ("show me the employee whose email is asdfg@gmail.com", {
        'entities': [(12, 20, 'Table'), (27, 32, 'Column'), (36, 51, 'Value')]
    }),
    ("get employee whose email is asdfg@gmail.com and whose from Washington state", {
        'entities': [(4, 12, 'Table'), (19, 24, 'Column'), (28, 43, 'Value'), (70, 75, 'Column'), (59, 69, 'Value')]
    }),
    ("get employee whose email is asdfg@gmail.com and whose state is Washington", {
        'entities': [(4, 12, 'Table'), (19, 24, 'Column'), (28, 43, 'Value'), (54, 59, 'Column'), (63, 73, 'Value')]
    }),
    ("get employees whose licenses are gjlkddfbgfa and whose state is Washington", {
        'entities': [(4, 13, 'Table'), (20, 28, 'Column'), (33, 44, 'Value'), (55, 60, 'Column'), (64, 74, 'Value')]
    }),
    ("get employees whose manager is Becky and whose state is Washington", {
        'entities': [(4, 13, 'Table'), (20, 27, 'Column'), (31, 36, 'Value'), (47, 52, 'Column'), (56, 66, 'Value')]
    }),
    ("get list of employees from Washington state or California state", {
        'entities': [(12, 21, 'Table'), (38, 43, 'Column'), (27, 37, 'Value'), (44, 46, 'Operator'), (58, 63, 'Column'), (47, 57, 'Value')]
    }),
    ("get details of all employees from state Delaware or state Michigan", {
        'entities': [(19, 28, 'Table'), (34, 39, 'Column'), (40, 48, 'Value'), (49, 51, 'Operator'), (52, 57, 'Column'), (58, 66, 'Value')]
    }),
    ("find employees whose email is qwerty@gmail.com and state is Michigan", {
        'entities': [(5, 14, 'Table'), (21, 26, 'Column'), (30, 46, 'Value'), (47, 50, 'Operator'), (51, 56, 'Column'), (60, 68, 'Value')]
    }),
    ("get list of employees with name Nilepta and phone number 5550147", {
        'entities': [(12, 21, 'Table'), (27, 31, 'Column'), (32, 39, 'Value'), (40, 43, 'Operator'), (44, 56, 'Column'), (57, 64, 'Value')]
    }),
    ("get list of employees with name Nilepta and email 5550147@hotmail.com", {
        'entities': [(12, 21, 'Table'), (27, 31, 'Column'), (32, 39, 'Value'), (40, 43, 'Operator'), (44, 49, 'Column'), (50, 69, 'Value')]
    })
    ]

# Fail fast if a hand-counted span is out of range, overlaps another span,
# carries surrounding whitespace, or starts/ends in the middle of a word.
for _text, _annotation in train_data:
    _previous_end = 0
    for _start, _end, _label in sorted(_annotation['entities']):
        _where = (_text, _start, _end, _label)
        assert _previous_end <= _start < _end <= len(_text), _where
        _span = _text[_start:_end]
        assert _span == _span.strip(), _where
        assert not (_start > 0 and _text[_start - 1].isalnum() and _span[0].isalnum()), _where
        assert not (_end < len(_text) and _span[-1].isalnum() and _text[_end].isalnum()), _where
        _previous_end = _end

In [6]:
# Start from a blank English pipeline; the NER component is created and added
# in the following cells. This rebinds `nlp`, replacing the pipeline loaded earlier.
nlp = spacy.blank('en')

In [7]:
# Create the 'ner' component; it only becomes part of the pipeline once it is
# passed to nlp.add_pipe in the next cell.
ner = nlp.create_pipe('ner')

In [8]:
# Register the NER component in the pipeline (nlp.pipe_names lists 'ner' afterwards).
nlp.add_pipe(ner)

In [9]:
# Register every entity label found in the training annotations with the NER
# component, in the order the annotations appear (repeats are passed again).
for entity_label in (span[2]
                     for _, sample in train_data
                     for span in sample.get('entities')):
    ner.add_label(entity_label)

In [10]:
# Sanity check: show the names of the components currently in the pipeline.
nlp.pipe_names

['ner']

In [11]:
# Names of all pipeline components other than 'ner'; these are disabled while training.
other_pipes = list(filter(lambda pipe_name: pipe_name != 'ner', nlp.pipe_names))

In [12]:
# Number of passes the training loop makes over `train_data`.
n_iter = 20

In [13]:
# Train with every component listed in `other_pipes` switched off, so only the
# NER component is updated.
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for epoch in range(n_iter):
        # Reorder the examples in place before each pass.
        random.shuffle(train_data)
        losses = {}
        # One update per example (batch of size one); `losses` accumulates
        # across the whole pass and is printed once per pass.
        for sample_text, sample_annotations in train_data:
            nlp.update(
                [sample_text],
                [sample_annotations],
                sgd=optimizer,
                drop=0.35,
                losses=losses,
            )
        print(losses)

{'ner': 120.497575092013}
{'ner': 79.10479178003519}
{'ner': 39.807145819324404}
{'ner': 22.11875925534216}
{'ner': 9.587047803669618}
{'ner': 17.123821176473403}
{'ner': 7.5847445620195675}
{'ner': 5.146356088866558}
{'ner': 5.036458903718314}
{'ner': 3.235791037419337}
{'ner': 8.516045763949272}
{'ner': 3.9337014505166117}
{'ner': 5.8155070745595205}
{'ner': 2.7715696238291043}
{'ner': 2.1985751158682274}
{'ner': 4.29173132982258}
{'ner': 0.23610946429400542}
{'ner': 6.622291926730245}
{'ner': 2.088216057916211}
{'ner': 4.597178712432043}


In [14]:
# Smoke-test the trained pipeline: run one sentence through it and list the
# entities it predicts.
# NOTE(review): this sentence differs from the last training example only in
# the e-mail domain, so it says little about generalisation.
test_text = "get list of employees with name Nilepta and email 5550147@gmail.com"
doc = nlp(test_text)

header = "Entities in '%s'" % test_text
print(header)
for position, entity in enumerate(doc.ents):
    summary = (position, entity.label_, entity.text)
    print("Entity number %s is %s in text: '%s'" % summary)

Entities in 'get list of employees with name Nilepta and email 5550147@gmail.com'
Entity number 0 is Table in text: 'employees'
Entity number 1 is Column in text: 'name'
Entity number 2 is Value in text: 'Nilepta'
Entity number 3 is Operator in text: 'and'
Entity number 4 is Column in text: 'email'
Entity number 5 is Value in text: '5550147@gmail.com'


In [None]:
from spacy.vocab import Vocab
# NOTE(review): unexecuted scratch cell. Builds a standalone Vocab whose string
# store is seeded with the table and column names; it is separate from the
# vocab of the `nlp` pipeline trained above — confirm intent or remove.
vocab = Vocab(strings=all_input_vectors)

In [None]:
from spacy.pipeline import EntityRecognizer

# NOTE(review): unexecuted scratch cell. This rebinds `ner` to a new recognizer
# built on the standalone `vocab` and adds it to `nlp`, which already contains
# an 'ner' component from the earlier cells — presumably this fails with a
# duplicate-name error in spaCy v2; confirm intent or remove.
ner = EntityRecognizer(vocab)
nlp.add_pipe(ner)

In [None]:
# NOTE(review): unexecuted scratch cell. Calling begin_training() again after
# the training loop presumably re-initialises the model weights and would
# discard the trained NER — confirm before running.
optimizer = nlp.begin_training()

In [None]:
# Scratch: start offset of the e-mail value inside this sentence (evaluates to
# 50); presumably used while hand-writing the entity spans in `train_data`.
"get list of employees with name Nilepta and email 5550147@hotmail.com".index("5550147@hotmail.com")

In [None]:
# Scratch: length of the e-mail value (19), i.e. the span is start .. start + 19.
len("5550147@hotmail.com")

In [None]:
# Scratch: length of "employee" (8).
# NOTE(review): most training sentences contain "employees" (9 characters), so
# this length is one short for those spans.
len("employee")

In [None]:
# Scratch: length of "email" (5).
len("email")