In [11]:
from __future__ import unicode_literals, print_function

import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding



names = ["Donna", "donna", "Mandy", "mandy", "Colin", "colin", "Janis", "janis", "Brian", "brian"]


def _examples_for(person):
    """Return the four annotated office queries for one person's name.

    Each example is a ``(text, annotation)`` pair. ``"heads"`` holds the index
    of each token's head and ``"deps"`` holds the matching labels; ``"-"`` is
    the filler label that ``test_model`` leaves out of its printout.
    """
    return [
        ("Find me {}'s office".format(person),
         {"heads": [0, 0, 4, 2, 0],
          "deps": ["ROOT", "-", "PERSON", "-", "PLACE"]}),
        ("Could you find me {}'s office".format(person),
         {"heads": [2, 2, 2, 2, 6, 4, 2],
          "deps": ["-", "-", "ROOT", "-", "PERSON", "-", "PLACE"]}),
        ("Where is {}'s office?".format(person),
         {"heads": [1, 1, 4, 2, 1, 1],
          "deps": ["-", "ROOT", "PERSON", "-", "PLACE", "-"]}),
        ("Where can I find {}'s office?".format(person),
         {"heads": [3, 3, 3, 3, 6, 4, 3, 3],
          "deps": ["-", "-", "-", "ROOT", "PERSON", "-", "PLACE", "-"]}),
    ]


# Four query variants per name, in the same order as `names`.
TRAIN_DATA = []
for name in names:
    TRAIN_DATA.extend(_examples_for(name))
    

def main(model=None, output_dir=None, n_iter=15):
    """Load the model, set up the pipeline and train the parser.

    Args:
        model: Name or path handed to ``spacy.load``. When ``None`` a blank
            English pipeline is created instead.
        output_dir: Directory to save the trained pipeline to. When given,
            the pipeline is written there, loaded back from disk and run
            through ``test_model`` a second time. When ``None`` nothing is
            saved.
        n_iter: Number of shuffled passes over the training examples.

    Returns:
        None. Progress, losses and the ``test_model`` output are printed.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # We'll use the built-in dependency parser class, but we want to create a
    # fresh instance, so any parser that came with the loaded model is dropped.
    if "parser" in nlp.pipe_names:
        nlp.remove_pipe("parser")
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser, first=True)

    # Register every label that occurs in the annotations with the parser.
    for _, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    # Shuffle a private copy: reordering the module-level TRAIN_DATA in place
    # would leave the notebook in a different state after every call.
    train_examples = list(TRAIN_DATA)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(train_examples)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True so a nested path such as "runs/a/model" also works
            output_dir.mkdir(parents=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        test_model(nlp2)


def test_model(nlp):
    texts = [
        "find office",
        "find Donna's office",
        "find me Mandy Green's office",
        "how can I find Donna's office"
    ]
    docs = nlp.pipe(texts)
    for doc in docs:
        print(doc.text)
        print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != "-"])

In [48]:
# On a fresh checkout "./model" does not exist yet, so start from the
# pretrained "en_core_web_md" package; once a run has saved "./model",
# later runs load that directory instead. Either way main() trains a new
# parser and writes the result back to "./model".
main("./model" if Path("./model").exists() else "en_core_web_md", output_dir="./model")

Loaded model './model'
Losses {'parser': 75.1100008636713}
Losses {'parser': 71.07219618558884}
Losses {'parser': 45.04308149218559}
Losses {'parser': 17.65409846464172}
Losses {'parser': 2.575062443036586}
Losses {'parser': 0.12528935682712472}
Losses {'parser': 0.0011910768143366113}
Losses {'parser': 1.2815568531165944e-05}
Losses {'parser': 3.516869703512615e-07}
Losses {'parser': 4.8024628444137685e-08}
Losses {'parser': 1.7055659417989158e-08}
Losses {'parser': 7.129709477752405e-09}
Losses {'parser': 3.4049019469791636e-09}
Losses {'parser': 1.6048069865621216e-09}
Losses {'parser': 1.0090771881474787e-09}
find office
[('find', 'ROOT', 'find'), ('office', 'PLACE', 'find')]
find Donna's office
[('find', 'ROOT', 'find'), ('Donna', 'PERSON', 'office'), ('office', 'PLACE', 'find')]
find me Mandy Green's office
[('find', 'ROOT', 'find'), ('Mandy', 'PERSON', 'office'), ('office', 'PLACE', 'find')]
how can I find Donna's office
[('find', 'ROOT', 'find'), ('Donna', 'PERSON', 'office'), 

In [49]:
import spacy

# Reload the pipeline from disk and list its components.
nlp = spacy.load("./model")
print(nlp.pipe_names)

# Parse one query, then print three parallel rows: each token's head index,
# its label, and its text.
text = "Hello, where is Brian's office?"
doc = nlp(text)
for describe in (lambda tok: tok.head.i, lambda tok: tok.dep_, lambda tok: tok.text):
    print([describe(tok) for tok in doc])

['parser', 'tagger', 'ner']
[3, 2, 3, 3, 6, 4, 3, 3]
['-', '-', '-', 'ROOT', 'PERSON', '-', 'PLACE', '-']
['Hello', ',', 'where', 'is', 'Brian', "'s", 'office', '?']


In [50]:
# Print one "LABEL text" line for every entity span attached to `doc`.
# NOTE(review): these labels are read from doc.ents, not from token.dep_, so a
# "PERSON" printed here is not the "PERSON" label used in TRAIN_DATA.
for ent in doc.ents:
    print(ent.label_, ent.text)

PERSON Brian


In [51]:
deps = [tok.dep_ for tok in doc]


def _positions_of(label):
    """Return the indices in ``deps`` whose label equals ``label``."""
    return [idx for idx, dep_label in enumerate(deps) if dep_label == label]


root = _positions_of('ROOT')
person = _positions_of('PERSON')
place = _positions_of('PLACE')

# Show the token at every ROOT position (prints nothing when there is none).
for i in root:
    print(doc[i])
# TODO: branch on the root token's text, e.g. 'is' or 'are'.
    

is


In [52]:
# Print every sentence span of `doc` on its own line.
# NOTE(review): doc.sents presumably depends on the parser trained in main()
# for sentence boundaries — confirm before relying on the split.
for sent in doc.sents:
    print(sent)

Hello, where is Brian's office?
