In [None]:
# install sp
import sys
!{sys.executable} -m pip install -U spacy

In [1]:
import spacy
import random

## Train the text classification model.

In [2]:
train_data = [
    ('get list of employees', {'cats': {'search': 0.91}}),
    ('get list of employees with name Nilepta', {'cats': {'search': 0.92}}),
    ('search employees with name Nilepta', {'cats': {'search': 0.93}}),
    ('find list of employees who live in Washington state', {'cats': {'search': 0.94}}),
    ('get details of all employees from state Washington', {'cats': {'search': 0.89}}),
    ('find employee with phone number as 2345678901', {'cats': {'search': 0.99}}),
    ('get list of employees from Washington state', {'cats': {'search': 0.992}}),
    ('find employees whose email is qwerty@gmail.com', {'cats': {'search': 0.993}}),
    ('get employee whose email is asdfg@gmail.com', {'cats': {'search': 0.994}}),
    ('get employee whose salary is between 100 and 200', {'cats': {'range': 0.99}}),
    ('get employee whose salary is above 100', {'cats': {'range': 0.992}}),
    ('get employee whose salary is below 100', {'cats': {'range': 0.993}}),
    ]

In [3]:
for text, annotations in train_data:
    print(text, annotations)

get list of employees {'cats': {'search': 0.91}}
get list of employees with name Nilepta {'cats': {'search': 0.92}}
search employees with name Nilepta {'cats': {'search': 0.93}}
find list of employees who live in Washington state {'cats': {'search': 0.94}}
get details of all employees from state Washington {'cats': {'search': 0.89}}
find employee with phone number as 2345678901 {'cats': {'search': 0.99}}
get list of employees from Washington state {'cats': {'search': 0.992}}
find employees whose email is qwerty@gmail.com {'cats': {'search': 0.993}}
get employee whose email is asdfg@gmail.com {'cats': {'search': 0.994}}
get employee whose salary is between 100 and 200 {'cats': {'range': 0.99}}
get employee whose salary is above 100 {'cats': {'range': 0.992}}
get employee whose salary is below 100 {'cats': {'range': 0.993}}


In [4]:
# nlp create model.
nlp = spacy.blank('en')

In [5]:
textcat = nlp.create_pipe('textcat')

In [6]:
nlp.add_pipe(textcat)

In [7]:
# add labelled data to the model.
textcat.add_label('search')
textcat.add_label('range')

1

In [8]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']

In [9]:
n_iter = 20

In [12]:
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        for text, annotations in train_data:
            nlp.update([text], [annotations], sgd=optimizer, drop=0.35, losses=losses)
        print(losses)

{'textcat': 4.8491813712753356}
{'textcat': 4.286091523885261}
{'textcat': 0.47355836396729956}
{'textcat': 1.3984007939579897}
{'textcat': 0.6296343083413376}
{'textcat': 1.122080572278719}
{'textcat': 0.04600611707246571}
{'textcat': 0.03053639830704924}
{'textcat': 0.10770763587424881}
{'textcat': 0.03747039042136535}
{'textcat': 0.04094115480984328}
{'textcat': 0.43272675330626953}
{'textcat': 0.03480597232328364}
{'textcat': 0.035273027351649944}
{'textcat': 0.08788099190860521}
{'textcat': 0.03352015775226391}
{'textcat': 0.3645384983465192}
{'textcat': 0.03801771286907751}
{'textcat': 0.061115093008993426}
{'textcat': 0.035783317471214104}


In [16]:
# testing the model.
test_text = "get employee with salary is below 100"
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for i, ent in enumerate(doc.cats):
    print((i, ent))

Entities in 'get employee with salary is below 100'
(0, 'range')
(1, 'search')


In [None]:
nlp.to_disk('/mnt/azmnt/code/Users/Congitveservice_NoSQL/NLP/Model')

In [None]:
import thinc.extra.datasets

def load_data(limit=0, split=1):
    """Load data from the IMDB dataset."""
    # Partition off part of the train data for evaluation
    train_data, _ = thinc.extra.datasets.imdb()
    random.shuffle(train_data)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

In [None]:
a, b = load_data()

In [None]:
len(a)

In [None]:
len(a[0])

In [None]:
a[0][1]

In [None]:
len(b)

In [None]:
b[0]

In [None]:
b[1]

In [None]:
a[1][1]

In [None]:
a[0][1]