In [83]:
from __future__ import unicode_literals, print_function

import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding



first_names = ["Donna", "donna", "Mandy", "mandy", "Colin", "colin", "Janis", "janis", "Brian", "brian"]
full_names = ["Donna Graves", "donna graves", "Mandy Green", "mandy green", "Colin Banger", "colin banger",
              "Janis Michael", "janis michael", "Brian Elliot", "brian elliot"]

# Each template is (sentence pattern, heads, deps).
#   heads[i] -- index of the head of token i (the root points at itself)
#   deps[i]  -- label of token i; "-" marks tokens that carry no intent info
# Indices assume one token per word of the name, with "'s" and "?" split off
# as separate tokens.
_FIRST_NAME_TEMPLATES = [
    ("Find me {}'s office",
     [0, 0, 4, 2, 0],
     ["ROOT", "-", "PERSON", "-", "PLACE"]),
    ("Could you find me {}'s office",
     [2, 2, 2, 2, 6, 4, 2],
     ["-", "-", "ROOT", "-", "PERSON", "-", "PLACE"]),
    ("Where is {}'s office?",
     [1, 1, 4, 2, 1, 1],
     ["-", "ROOT", "PERSON", "-", "PLACE", "-"]),
    ("Where can I find {}'s office?",
     [3, 3, 3, 3, 6, 4, 3, 3],
     ["-", "-", "-", "ROOT", "PERSON", "-", "PLACE", "-"]),
    ("Let me know {}'s office",
     [2, 2, 2, 5, 3, 2],
     ["-", "-", "ROOT", "PERSON", "-", "PLACE"]),
]

# Two-token names: first name -> last name -> "office"; "'s" -> last name.
_FULL_NAME_TEMPLATES = [
    ("Find me {}'s office",
     [0, 0, 3, 5, 3, 0],
     ["ROOT", "-", "PERSON", "PERSON", "-", "PLACE"]),
    # Fixed: the first name and "'s" used to point at index 6 ("'s" itself),
    # which created a second root; both attach to the last name (index 5).
    ("Could you find me {}'s office",
     [2, 2, 2, 2, 5, 7, 5, 2],
     ["-", "-", "ROOT", "-", "PERSON", "PERSON", "-", "PLACE"]),
    ("Where is {}'s office?",
     [1, 1, 3, 5, 3, 1, 1],
     ["-", "ROOT", "PERSON", "PERSON", "-", "PLACE", "-"]),
    ("Where can I find {}'s office?",
     [3, 3, 3, 3, 5, 7, 5, 3, 3],
     ["-", "-", "-", "ROOT", "PERSON", "PERSON", "-", "PLACE", "-"]),
    ("Let me know {}'s office",
     [2, 2, 2, 4, 6, 4, 2],
     ["-", "-", "ROOT", "PERSON", "PERSON", "-", "PLACE"]),
]


def _validate_annotation(pattern, heads, deps):
    """Raise ValueError unless heads/deps describe a single-rooted parse.

    Checks that both lists have the same length, that every head index is in
    range, and that exactly one token is its own head and is labelled "ROOT".
    """
    if len(heads) != len(deps):
        raise ValueError("heads/deps length mismatch in %r" % pattern)
    if any(not 0 <= head < len(heads) for head in heads):
        raise ValueError("head index out of range in %r" % pattern)
    self_headed = [i for i, head in enumerate(heads) if head == i]
    if len(self_headed) != 1 or deps[self_headed[0]] != "ROOT":
        raise ValueError("expected exactly one self-headed ROOT token in %r" % pattern)


def _expand_templates(names, templates):
    """Return one (text, annotations) example per name/template pair.

    Examples are ordered name by name, templates in their listed order. Every
    example gets its own copies of the heads and deps lists.
    """
    examples = []
    for name in names:
        for pattern, heads, deps in templates:
            _validate_annotation(pattern, heads, deps)
            examples.append((pattern.format(name),
                             {"heads": list(heads),  # index of token head
                              "deps": list(deps)}))
    return examples


TRAIN_DATA = (_expand_templates(first_names, _FIRST_NAME_TEMPLATES)
              + _expand_templates(full_names, _FULL_NAME_TEMPLATES))

def main(model=None, output_dir=None, n_iter=15):
    """Load the model, set up the pipeline and train the parser.

    Args:
        model: name or path passed to ``spacy.load``; when None a blank
            English pipeline is created instead.
        output_dir: directory the trained pipeline is written to and then
            reloaded from as a check; nothing is saved when None.
        n_iter: number of passes over TRAIN_DATA.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # We'll use the built-in dependency parser class, but we want to create a
    # fresh instance – just in case.
    if "parser" in nlp.pipe_names:
        nlp.remove_pipe("parser")
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser, first=True)

    # Register every dependency label that occurs in the training data.
    for _, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    # Shuffle a private copy: shuffling TRAIN_DATA itself would reorder the
    # module-level list, so re-running this cell would start from a different
    # state each time.
    train_data = list(TRAIN_DATA)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents/exist_ok: works for nested paths and avoids the race between
        # a separate exists() check and mkdir().
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        test_model(nlp2)


def test_model(nlp):
    texts = [
        "find office",
        "find Donna's office",
        "find me Mandy Green's office",
        "how can I find Donna's office"
    ]
    docs = nlp.pipe(texts)
    for doc in docs:
        print(doc.text)
        print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != "-"])

In [84]:
# Train starting from the pretrained English pipeline and save to ./model.
# To start from the previously saved pipeline instead, run:
#   main("./model", output_dir="./model")
main(model="en_core_web_md", output_dir="./model")

Loaded model 'en_core_web_md'
Losses {'parser': 296.3037513503805}
Losses {'parser': 39.11321369397722}
Losses {'parser': 43.93260544572475}
Losses {'parser': 25.84486157105127}
Losses {'parser': 26.42774046420255}
Losses {'parser': 18.257370498060904}
Losses {'parser': 24.0604481315965}
Losses {'parser': 18.78090803173344}
Losses {'parser': 20.44218352009827}
Losses {'parser': 25.971109827173958}
Losses {'parser': 20.684882341731903}
Losses {'parser': 12.434856218369823}
Losses {'parser': 11.051060879968944}
Losses {'parser': 14.115828102680501}
Losses {'parser': 16.820759796164797}
find office
[('find', 'ROOT', 'find'), ('office', 'PLACE', 'find')]
find Donna's office
[('find', 'ROOT', 'find'), ('Donna', 'PERSON', 'office'), ('office', 'PLACE', 'find')]
find me Mandy Green's office
[('find', 'ROOT', 'find'), ('Mandy', 'PERSON', 'Green'), ('Green', 'PERSON', 'office'), ('office', 'PLACE', 'find')]
how can I find Donna's office
[('find', 'ROOT', 'find'), ('Donna', 'PERSON', 'office'), 

In [116]:
import spacy

# Reload the pipeline saved under "./model"
# (the stock alternative is "en_core_web_md").
nlp = spacy.load("./model")
print(nlp.pipe_names)

# Alternative query: "let me know brian elliot's office"
text = "John's place?"
doc = nlp(text)

# Per-token view of the parse: head index, dependency label, surface text.
head_indices = [tok.head.i for tok in doc]
dep_labels = [tok.dep_ for tok in doc]
token_texts = [tok.text for tok in doc]
print(head_indices)
print(dep_labels)
# print([tok.pos_ for tok in doc])
print(token_texts)

['parser', 'tagger', 'ner']
[2, 0, 2, 2]
['PERSON', '-', 'ROOT', '-']
['John', "'s", 'place', '?']


In [112]:
import spacy

# Baseline: the stock English pipeline, for comparison with the custom parser.
# NOTE: this rebinds `nlp` and `doc`; code that looks for the custom
# ROOT/PERSON/PLACE labels needs a parse from "./model" instead.
nlp = spacy.load("en_core_web_md")

print(nlp.pipe_names)

# Alternative query: "let me know brian elliot's office"
text = "John's place?"
doc = nlp(text)

# Per-token view of the parse: head index, dependency label, surface text.
head_indices = [tok.head.i for tok in doc]
dep_labels = [tok.dep_ for tok in doc]
token_texts = [tok.text for tok in doc]
print(head_indices)
print(dep_labels)
# print([tok.pos_ for tok in doc])
print(token_texts)

['tagger', 'parser', 'ner']
[2, 0, 2, 2]
['poss', 'case', 'ROOT', 'punct']
['John', "'s", 'place', '?']


In [113]:
# Sentences of the current `doc`, one per line.
for sent in doc.sents:
    print(sent)

# Named entities of the current `doc`, printed as "<label> <text>".
for ent in doc.ents:
    print("{} {}".format(ent.label_, ent.text))

John's place?
PERSON John


In [117]:
from collections import defaultdict

# Derive an intent and its entities from the dependency labels of `doc`.
#
# NOTE: `doc` has to be a parse from the custom "./model" pipeline, whose
# parser emits ROOT / PERSON / PLACE / "-". The stock "en_core_web_md" parser
# labels the same sentence poss / case / ROOT / punct, so no entities would be
# collected from it.
deps = [token.dep_ for token in doc]
root = [i for i, dep in enumerate(deps) if dep == 'ROOT']
person = [i for i, dep in enumerate(deps) if dep == 'PERSON']
place = [i for i, dep in enumerate(deps) if dep == 'PLACE']

entities = defaultdict(list)
entities_to_have = ["PERSON", "PLACE"]
intent = None
# Defined up front so the summary prints at the bottom work on every branch
# (previously a NameError on the "too complex" path).
root_string = None
intent_find_words = ["find", "search", "know", "wanna", "want"]
entity_office_words = ["office", "room", "place"]

if not root:
    # Nothing to anchor on; indexing root[0] would raise IndexError.
    print("no ROOT token found")
elif len(root) > 1 or len(person) > 2 or len(place) > 1:
    print("too complex for this version")
else:
    root_index = root[0]

    # Walk outwards from the root one level at a time, collecting every
    # PERSON/PLACE token whose head is the root or an already collected token.
    frontier = [root_index]
    while frontier:
        next_frontier = []
        for i, t in enumerate(doc):
            if t.dep_ in entities_to_have and t.head.i in frontier:
                entities[t.dep_].append(str(t))
                next_frontier.append(i)
        frontier = next_frontier

    root_string = str(doc[root_index]).lower()

    if root_string in entity_office_words:
        # The root itself names the place, e.g. "John's place?".
        intent = "FIND_OFFICE_LOC"
    elif root_string in intent_find_words:
        # A "find"-style verb only counts when one of its PLACE entities is
        # an office word.
        if any(found_place.lower() in entity_office_words
               for found_place in entities.get("PLACE", [])):
            intent = "FIND_OFFICE_LOC"

print(root_string)
print(intent)
print(entities)


place
FIND_OFFICE_LOC
defaultdict(<class 'list'>, {'PERSON': ['John']})
