In [None]:
# install sp
import sys
!{sys.executable} -m pip install -U spacy

In [19]:
import spacy
import random

## Train the spacy entity recognizition model.

In [20]:
train_data = [
    ("get list of employees", {
        'entities': [(12, 20, 'Table')]
    }),
    ("get list of employees with name Nilepta", {
        'entities': [(12, 20, 'Table'), (27, 31, 'Column'), (32, 39, 'Value')]
    }),
    ("search employees with name Nilepta", {
        'entities': [(7, 16, 'Table'), (22, 26, 'Column')]
    }),
    ("find list of employees who live in Washington state", {
        'entities': [(13, 22, 'Table'), (46, 51, 'Column'), (35, 45, 'Value')]
    }),
    ("get details of all employees from state Washington", {
        'entities': [(19, 28, 'Table'), (34, 39, 'Column'), (40, 50, 'Value')]
    }),
    ("find employee with phone number as 2345678901", {
        'entities': [(5, 13, 'Table'), (19, 24, 'Column'), (35, 45, 'Value')]
    }),
    ("get list of employees from Washington state", {
        'entities': [(12, 21, 'Table'), (38, 43, 'Column'), (27, 37, 'Value')]
    }),
    ("find employees whose email is qwerty@gmail.com", {
     'entities': [(5, 14, 'Table'), (21, 26, 'Column'), (30, 46, 'Value')]
    }),
    ("get employee whose email is asdfg@gmail.com", {
        'entities': [(4, 12, 'Table'), (19, 24, 'Column'), (28, 43, 'Value')]
    })
    ]

In [21]:
# nlp create model.
nlp = spacy.blank('en')

In [22]:
ner = nlp.create_pipe('ner')

In [23]:
nlp.add_pipe(ner)

In [24]:
# add labelled data to the model.
for _, annotations in train_data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

In [25]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

In [26]:
n_iter = 20

In [27]:
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        for text, annotations in train_data:
            nlp.update([text], [annotations], sgd=optimizer, drop=0.35, losses=losses)
        print(losses)

{'ner': 48.06864865496755}
{'ner': 32.6895731506811}
{'ner': 31.50973227131272}
{'ner': 24.738726106341346}
{'ner': 20.175611039048817}
{'ner': 12.369157264173484}
{'ner': 7.924214684312114}
{'ner': 4.877818860630185}
{'ner': 2.1710149929853557}
{'ner': 0.9634815273257127}
{'ner': 3.2962246101248565}
{'ner': 3.802578118814646}
{'ner': 3.059533075291307}
{'ner': 1.482130764071137}
{'ner': 2.209023112724146}
{'ner': 2.812084155380485}
{'ner': 2.0656549037019927}
{'ner': 1.7738983929786547}
{'ner': 0.6991681326590278}
{'ner': 0.7264539918501943}


In [28]:
# testing the model.
test_text = "get employee with email as rohan@gmail.com"
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for i, ent in enumerate(doc.ents):
    print("Entity number %s is %s in text: '%s'" % (i, ent.label_, ent.text))

Entities in 'get employee with email as rohan@gmail.com'
Entity number 0 is Table in text: 'employee'
Entity number 1 is Column in text: 'email'
Entity number 2 is Value in text: 'rohan@gmail.com'


In [15]:
ner_bytes = ner.to_bytes('/mnt/azmnt/code/Users/Source/Congitveservice_NoSQL/NLP/ner-model-1b')

In [29]:
nlp_bytes = nlp.to_bytes()

In [30]:
f = open('/mnt/azmnt/code/Users/Source/Congitveservice_NoSQL/NLP/ner-model-1c', 'wb+')

In [31]:
f.write(nlp_bytes)

4143381

In [32]:
f.close()