In [1]:
import sagas

In [2]:
# Render a single (name, age) record as a table with the given column headers.
sagas.print_rs([("tom", 5)], ["name", "age"])

+----+--------+-------+
|    | name   |   age |
|----+--------+-------|
|  0 | tom    |     5 |
+----+--------+-------+


In [5]:
from rasa.nlu import config
# Load the sample CRF pipeline configuration and list its component names.
conf = config.load("saai/sample_configs/config_crf_custom_features.yml")
# conf.for_component('DucklingHTTPExtractor')
conf.component_names

['SpacyNLP',
 'SpacyTokenizer',
 'SpacyEntityExtractor',
 'DucklingHTTPExtractor',
 'CRFEntityExtractor',
 'SklearnIntentClassifier']

In [7]:
# Show the language attribute of the configuration loaded above.
conf.language

'en'

In [10]:
# print(conf.get('DucklingHTTPExtractor'))

In [1]:
from rasa.nlu.training_data import TrainingData, Message
from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer

def testing_tokenizer(text, cls, lang='en'):
    """Run a tokenizer class over ``text`` and print every resulting token.

    Args:
        text: The sentence to tokenize.
        cls: Tokenizer class; it is instantiated with a single config dict.
        lang: Value stored under the ``"lang"`` key of that config dict.

    Returns:
        None. One line per token is printed: the token text and its offset.
    """
    tokenizer = cls({
        # Flag to check whether to split intents
        "intent_tokenization_flag": False,
        # Symbol on which intent should be split
        "intent_split_symbol": "_",
        # Text is tokenized case-sensitively by default
        "case_sensitive": True,
        "lang": lang,
    })

    # Wrap the text in a message carrying a dummy intent and no entities.
    message = Message(text, {"intent": "wish", "entities": []})

    # The extra keyword argument is passed through to process() unchanged.
    tokenizer.process(message, x='.')

    for tok in message.get("tokens"):
        print(tok.text, tok.offset)

# Baseline: tokenize an English sentence with the stock whitespace tokenizer.
text = 'text will be tokenized with case sensitive as default'
testing_tokenizer(text, WhitespaceTokenizer)

text 0
will 5
be 10
tokenized 13
with 23
case 28
sensitive 33
as 43
default 46


In [4]:
from sagas.util.rest_common import query_data_by_url
# Query the 'multilang' / 'tokens' endpoint with a Chinese sentence and show
# the 'data' field of the response.
lang = 'zh'
sents = "在终端上输出单词的定义和继承链"
r = query_data_by_url(
    'multilang',
    'tokens',
    {'lang': lang, 'sents': sents},
)
r['data']

['在', '终端', '上', '输出', '单词', '的', '定义', '和', '继承链']

In [10]:
from typing import Any, Dict, List, Text
from rasa.nlu.tokenizers import Token, Tokenizer
from rasa.nlu.constants import (
    MESSAGE_RESPONSE_ATTRIBUTE,
    MESSAGE_INTENT_ATTRIBUTE,
    MESSAGE_TEXT_ATTRIBUTE,
    MESSAGE_TOKENS_NAMES,
    MESSAGE_ATTRIBUTES,
    MESSAGE_SPACY_FEATURES_NAMES,
    MESSAGE_VECTOR_FEATURE_NAMES,
)

class MultilangTokenizer(WhitespaceTokenizer):
    """Whitespace tokenizer that hands Chinese and Japanese text to a remote service.

    When the configured language is ``'zh'`` or ``'ja'`` the text is segmented
    by the ``'multilang'`` / ``'tokens'`` endpoint reached through
    ``query_data_by_url``; for any other language the inherited
    ``WhitespaceTokenizer.tokenize`` is used.
    """

    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
        """Construct a new tokenizer using the WhitespaceTokenizer framework.

        Args:
            component_config: Component configuration. Its optional ``"lang"``
                entry selects the language; ``'en'`` is assumed when missing.
        """
        super().__init__(component_config)
        # Indexing with ["lang"] raised KeyError for configs that do not set a
        # language, so fall back to 'en' (whitespace tokenization).
        self.lang = self.component_config.get("lang", "en")
        print(f".. tokenizer with lang {self.lang}")

    def tokenize(
        self, text: Text, attribute: Text = MESSAGE_TEXT_ATTRIBUTE
    ) -> List[Token]:
        """Split ``text`` into tokens carrying their character offsets.

        Args:
            text: The text to tokenize.
            attribute: Message attribute being tokenized; only forwarded to
                the parent implementation on the whitespace path.

        Returns:
            A list of ``Token`` objects, each built from a word and its offset
            in ``text``.

        Raises:
            ValueError: If a word returned by the service cannot be found in
                ``text`` at or after the end of the previous word.
        """
        if self.lang not in ('zh', 'ja'):
            return super().tokenize(text, attribute)

        r = query_data_by_url('multilang', 'tokens', {'lang': self.lang, 'sents': text})

        tokens = []
        # Searching from the end of the previous word makes repeated words
        # resolve to successive occurrences instead of the first one.
        running_offset = 0
        for word in r['data']:
            word_offset = text.index(word, running_offset)
            running_offset = word_offset + len(word)
            tokens.append(Token(word, word_offset))
        return tokens
        
text = 'text will be tokenized with case sensitive as default'
# testing_tokenizer prints the tokens itself and returns None, so wrapping the
# call in print() would only append a stray "None" to the output.
testing_tokenizer(text, MultilangTokenizer, 'en')

.. tokenizer with lang en
text 0
will 5
be 10
tokenized 13
with 23
case 28
sensitive 33
as 43
default 46
None


In [9]:
text = "在终端上输出单词的定义和继承链"
# testing_tokenizer prints the tokens itself and returns None, so the calls are
# not wrapped in print() (that would emit a stray "None" after each run).
testing_tokenizer(text, MultilangTokenizer, 'zh')
testing_tokenizer("望遠鏡で泳いでいる少女を見た。", MultilangTokenizer, 'ja')

.. tokenizer with lang zh
在 0
终端 1
上 3
输出 4
单词 6
的 8
定义 9
和 11
继承链 12
None
.. tokenizer with lang ja
望遠 0
鏡 2
で 3
泳いで 4
いる 7
少女 9
を 11
見た 12
。 14
None
