In [2]:
from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
tk = WhitespaceTokenizer()

# Plain ASCII sentence: one token per space-separated word.
simple_text = "Forecast for lunch"
simple_tokens = [token.text for token in tk.tokenize(simple_text)]
assert simple_tokens == ['Forecast', 'for', 'lunch']

# Mixed-script sentence: the expected tokens carry no '!', ',', '.' or '?',
# while "10.000" and "how're" stay in one piece.
mixed_text = "привет! 10.000, ńöñàśçií. how're you?"
mixed_tokens = [token.text for token in tk.tokenize(mixed_text)]
assert mixed_tokens == ['привет', '10.000', 'ńöñàśçií', "how're", 'you']

# The expected offsets are character positions in the input string.
mixed_offsets = [token.offset for token in tk.tokenize(mixed_text)]
assert mixed_offsets == [0, 8, 16, 26, 33]

In [5]:
import logging
import os

import pytest
from rasa_nlu import data_router, config
from rasa_nlu.components import ComponentBuilder
from rasa_nlu.model import Trainer
from rasa_nlu.utils import zip_folder
from rasa_nlu import training_data

# Request DEBUG-level logging from the root logger for the cells below.
logging.basicConfig(level=logging.DEBUG)

# Configuration file read by the default_config fixture.
CONFIG_DEFAULTS_PATH = "sample_configs/config_defaults.yml"

# NOTE(review): the two paths below are not referenced by any cell visible
# in this notebook — confirm whether they are still needed.
DEFAULT_DATA_PATH = "data/examples/rasa/demo-rasa.json"
TEST_MODEL_PATH = "test_models/test_model_spacy_sklearn"


@pytest.fixture(scope="session")
def component_builder():
    """Construct and return a new ``ComponentBuilder``.

    Registered as a session-scoped pytest fixture.
    """
    builder = ComponentBuilder()
    return builder


@pytest.fixture(scope="session")
def spacy_nlp(component_builder, default_config):
    """Return the ``nlp`` attribute of the "nlp_spacy" component.

    The component is created through ``component_builder`` using
    ``default_config``.
    """
    spacy_component = component_builder.create_component("nlp_spacy",
                                                         default_config)
    return spacy_component.nlp


@pytest.fixture(scope="session")
def ner_crf_pos_feature_config():
    """Return a component config whose "features" entry holds three lists.

    The first and third lists are equal; all three include the "pos" and
    "pos2" feature names.
    NOTE(review): presumably the lists map to the previous / current / next
    token of the CRF window — confirm against CRFEntityExtractor.
    """
    neighbour_features = ["low", "title", "upper", "pos", "pos2"]
    centre_features = ["bias", "low", "suffix3", "suffix2", "upper",
                       "title", "digit", "pos", "pos2", "pattern"]
    # Copy the neighbour list so the first and last entries stay separate
    # list objects.
    return {
        "features": [
            list(neighbour_features),
            centre_features,
            list(neighbour_features),
        ]
    }


@pytest.fixture(scope="session")
def mitie_feature_extractor(component_builder, default_config):
    """Return the ``extractor`` attribute of the "nlp_mitie" component.

    The component is created through ``component_builder`` using
    ``default_config``.
    """
    mitie_component = component_builder.create_component("nlp_mitie",
                                                         default_config)
    return mitie_component.extractor


@pytest.fixture(scope="session")
def default_config():
    """Load and return the configuration stored at ``CONFIG_DEFAULTS_PATH``."""
    loaded_config = config.load(CONFIG_DEFAULTS_PATH)
    return loaded_config

# Build the spaCy pipeline once for the cells that follow. This rebinds the
# name `spacy_nlp` from the fixture function defined above to the object it
# returns, and it invokes the fixture functions directly as plain callables.
# NOTE(review): newer pytest releases refuse direct fixture calls — confirm
# the pytest version this notebook is expected to run with.
spacy_nlp = spacy_nlp(
    component_builder(),
    default_config(),
)

  from ._conv import register_converters as _register_converters
INFO:rasa_nlu.utils.spacy_utils:Trying to load spacy model with name 'en'
INFO:rasa_nlu.components:Added 'nlp_spacy' to component cache. Key 'nlp_spacy-en'.


In [6]:
from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
tk = SpacyTokenizer()

# (input text, expected token texts, expected character offsets)
spacy_cases = [
    ("Forecast for lunch",
     ['Forecast', 'for', 'lunch'],
     [0, 9, 13]),
    # The expected result splits "how're" into 'how' + "'re" and keeps '?'
    # as its own token.
    ("hey ńöñàśçií how're you?",
     ['hey', 'ńöñàśçií', 'how', "'re", 'you', '?'],
     [0, 4, 13, 16, 20, 23]),
]

for text, want_texts, want_offsets in spacy_cases:
    assert [t.text for t in tk.tokenize(spacy_nlp(text))] == want_texts
    assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == want_offsets

In [7]:
def test_jieba():
    """Check JiebaTokenizer token texts and offsets on two sentences."""
    from rasa_nlu.tokenizers.jieba_tokenizer import JiebaTokenizer

    tokenizer = JiebaTokenizer()

    # (sentence, expected token texts, expected character offsets)
    cases = [
        ("我想去吃兰州拉面",
         ['我', '想', '去', '吃', '兰州', '拉面'],
         [0, 1, 2, 3, 4, 6]),
        # Latin and Chinese characters mixed in one sentence.
        ("Micheal你好吗？",
         ['Micheal', '你好', '吗', '？'],
         [0, 7, 9, 10]),
    ]

    for sentence, want_texts, want_offsets in cases:
        assert [tok.text for tok in tokenizer.tokenize(sentence)] == \
            want_texts
        assert [tok.offset for tok in tokenizer.tokenize(sentence)] == \
            want_offsets

# Execute the check in this cell; a failed assertion raises AssertionError.
test_jieba()

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/fv/7k1qk5v11dn33sdcngv2wbnm0000gn/T/jieba.cache
DEBUG:jieba:Dumping model to file cache /var/folders/fv/7k1qk5v11dn33sdcngv2wbnm0000gn/T/jieba.cache
Loading model cost 0.999 seconds.
DEBUG:jieba:Loading model cost 0.999 seconds.
Prefix dict has been built succesfully.
DEBUG:jieba:Prefix dict has been built succesfully.


In [9]:
from rasa_nlu.config import RasaNLUModelConfig
from rasa_nlu.extractors.spacy_entity_extractor import SpacyEntityExtractor
from rasa_nlu.training_data import TrainingData, Message

def test_spacy_ner_extractor(spacy_nlp):
    """Run SpacyEntityExtractor on one message and check the single entity.

    ``spacy_nlp`` is called on text to build the message's "spacy_doc" and
    is also handed to ``process`` as a keyword argument.
    """
    extractor = SpacyEntityExtractor()

    # The message text says "West" while the attached doc is built from
    # lowercase "west"; the expected value below is 'West'.
    # NOTE(review): presumably process() works from the message text rather
    # than the stored doc — confirm.
    message_data = {
        "intent": "restaurant_search",
        "entities": [],
        "spacy_doc": spacy_nlp("anywhere in the west"),
    }
    message = Message("anywhere in the West", message_data)

    extractor.process(message, spacy_nlp=spacy_nlp)

    expected_entity = {
        'start': 16,
        'extractor': 'ner_spacy',
        'end': 20,
        'value': 'West',
        'entity': 'LOC',
        'confidence': None,
    }
    assert len(message.get("entities", [])) == 1
    assert message.get("entities")[0] == expected_entity

# Pass in the module-level `spacy_nlp` object that an earlier cell bound.
test_spacy_ner_extractor(spacy_nlp)

In [10]:
def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
    """Train a CRFEntityExtractor on two messages and check its helpers.

    Covers the text-to-CRF conversion, the per-token feature dicts,
    entity extraction on a sentence, and ``filter_trainable_entities``.
    """
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor(
        component_config=ner_crf_pos_feature_config)

    # First example: one entity annotation with no "extractor" key.
    west_message = Message("anywhere in the west", {
        "intent": "restaurant_search",
        "entities": [{"start": 16, "end": 20,
                      "value": "west", "entity": "location"}],
        "spacy_doc": spacy_nlp("anywhere in the west")
    })
    # Second example: one annotation attributed to another extractor and
    # one attributed to "ner_crf".
    restaurant_message = Message("central indian restaurant", {
        "intent": "restaurant_search",
        "entities": [
            {"start": 0, "end": 7, "value": "central",
             "entity": "location", "extractor": "random_extractor"},
            {"start": 8, "end": 14, "value": "indian",
             "entity": "cuisine", "extractor": "ner_crf"}
        ],
        "spacy_doc": spacy_nlp("central indian restaurant")
    })
    examples = [west_message, restaurant_message]

    # Train with an empty model config. The original author's note here
    # read "uses BILOU and the default features".
    # NOTE(review): the feature lists come from component_config — confirm
    # that "default features" still applies.
    extractor.train(TrainingData(training_examples=examples),
                    RasaNLUModelConfig())

    sentence = 'anywhere in the west'
    parsed = {"spacy_doc": spacy_nlp(sentence)}
    crf_tokens = extractor._from_text_to_crf(Message(sentence, parsed))
    assert [entry[0] for entry in crf_tokens] == \
        ['anywhere', 'in', 'the', 'west']

    features = extractor._sentence_to_features(crf_tokens)
    assert 'BOS' in features[0]
    assert 'EOS' in features[-1]
    assert features[1]['0:low'] == "in"

    # Only exercised for errors; the returned entities are not inspected.
    extractor.extract_entities(
        Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))

    # Expected values are written out as fresh literals so the comparisons
    # do not share objects with the inputs.
    trainable = extractor.filter_trainable_entities(examples)
    assert trainable[0].get('entities') == [
        {"start": 16, "end": 20, "value": "west", "entity": "location"}
    ], 'Entity without extractor remains'
    assert trainable[1].get('entities') == [
        {"start": 8, "end": 14,
         "value": "indian", "entity": "cuisine", "extractor": "ner_crf"}
    ], 'Only ner_crf entity annotation remains'
    assert examples[1].get('entities')[0] == {
        "start": 0, "end": 7,
        "value": "central", "entity": "location",
        "extractor": "random_extractor"
    }, 'Original examples are not mutated'

# `spacy_nlp` is the module-level object bound in an earlier cell; the feature
# config is obtained by calling the fixture function directly.
test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config())

INFO:rasa_nlu.training_data.training_data:Training data stats: 
	- intent examples: 2 (1 distinct intents)
	- Found intents: 'restaurant_search'
	- entity examples: 2 (2 distinct entities)
	- found entities: 'cuisine', 'location'



In [16]:
def test_duckling_entity_extractor(component_builder):
    """Check "ner_duckling" time extraction with and without a message time."""
    model_config = RasaNLUModelConfig(
        {"pipeline": [{"name": "ner_duckling"}]})
    model_config.set_component_attr("ner_duckling", dimensions=["time"])
    extractor = component_builder.create_component("ner_duckling",
                                                   model_config)

    # Message created without a time argument: three entities are expected.
    undated = Message("Today is the 5th of May. Let us meet tomorrow.")
    extractor.process(undated)
    assert len(undated.get("entities")) == 3

    # Message created with an explicit time.
    # 1381536182000 ms since the epoch == 2013-10-12T00:03:02 UTC, so
    # "tomorrow" is expected to resolve to 2013-10-13.
    dated = Message("Let us meet tomorrow.", time="1381536182000")
    extractor.process(dated)
    found = dated.get("entities")
    assert len(found) == 1
    first = found[0]
    assert first["text"] == "tomorrow"
    assert first["value"] == "2013-10-13T00:00:00.000Z"


def test_duckling_entity_extractor_and_synonyms(component_builder):
    """Run "ner_duckling" (numbers) and then "ner_synonyms" on one message.

    The final assertion cannot fail on its own, so the test effectively
    passes when neither ``process`` call raises.
    """
    model_config = RasaNLUModelConfig(
        {"pipeline": [{"name": "ner_duckling"}]})
    model_config.set_component_attr("ner_duckling", dimensions=["number"])
    number_extractor = component_builder.create_component("ner_duckling",
                                                          model_config)
    synonym_mapper = component_builder.create_component("ner_synonyms",
                                                        model_config)

    message = Message("He was 6 feet away")
    number_extractor.process(message)
    # The synonym processor must cope with entities that have int values.
    synonym_mapper.process(message)
    assert message is not None

# Only test_duckling_entity_extractor is executed here;
# test_duckling_entity_extractor_and_synonyms is defined in this cell but is
# not invoked. The builder comes from calling the fixture function directly.
test_duckling_entity_extractor(component_builder())

DEBUG:root:Passing reference time 2013-10-12T00:03:02+00:00 to duckling


In [1]:
from sagas.ja.japanese_tokenizer import JapaneseTokenizer
tk = JapaneseTokenizer()

# Print the token texts, then the token offsets, for one sample sentence.
japanese_sentence = "お皿を二枚ください。"
print([token.text for token in tk.tokenize(japanese_sentence)])
print([token.offset for token in tk.tokenize(japanese_sentence)])

['お', '皿', 'を', '二', '枚', 'ください', '。']
[0, 1, 2, 3, 4, 5, 9]


In [14]:
from rasa_nlu.tokenizers.jieba_tokenizer import JiebaTokenizer
tk = JiebaTokenizer()

# Print the token texts jieba produces for one sample sentence.
chinese_sentence = "我想去吃兰州拉面"
print([token.text for token in tk.tokenize(chinese_sentence)])

['我', '想', '去', '吃', '兰州', '拉面']
