In [1]:
import sys

from google.protobuf.json_format import MessageToJson
from client_wrapper import ServiceClient

import nlpserv_pb2 as nlp_messages
import nlpserv_pb2_grpc as nlp_service

In [25]:
from cabocha.analyzer import CaboChaAnalyzer
from rasa_nlu.tokenizers import Tokenizer, Token

def tokenize(text):
    """Split ``text`` into ``Token`` objects using a CaboCha parse.

    A new ``CaboChaAnalyzer`` parses ``text``; the surface form of every
    token in every chunk of the resulting tree becomes one ``Token``,
    paired with the character index at which that surface is found in
    ``text``.

    Args:
        text: The string to tokenize.

    Returns:
        A list of ``Token(surface, offset)`` objects in parse order.

    Raises:
        ValueError: Propagated from ``str.index`` when a surface form cannot
            be found in ``text`` at or after the end of the previous token.
    """
    parsed = CaboChaAnalyzer().parse(text)
    surfaces = [tok.surface for chunk in parsed for tok in chunk]

    result = []
    cursor = 0
    for surface in surfaces:
        # Search from the end of the previous match so that a repeated
        # surface resolves to its next occurrence, not the first one.
        start = text.index(surface, cursor)
        result.append(Token(surface, start))
        cursor = start + len(surface)
    return result

# Quick check: print each token's text next to its character offset.
tokens = tokenize("お皿を二枚ください。")
for tok in tokens:
    print(tok.text, tok.offset)

お 0
皿 1
を 2
二 3
枚 4
ください 5
。 9


In [31]:
# Parse a sample sentence with CaboCha and mirror the resulting chunk/token
# tree into NlCabocha* messages from nlpserv_pb2 (imported as nlp_messages).
text = "お皿を二枚ください。"

analyzer = CaboChaAnalyzer()
tree = analyzer.parse(text)

msg_chunks = nlp_messages.NlCabochaChunks()
chunks = []
for chunk in tree:
    # Copy the chunk-level attributes one by one onto a fresh message.
    msg_chunk = nlp_messages.NlCabochaChunk()
    msg_chunk.id = chunk.id
    # Only copy additional_info when the parser supplied one; otherwise the
    # message field is left unset.
    if chunk.additional_info is not None:
        msg_chunk.additional_info = chunk.additional_info
    msg_chunk.feature_list.extend(chunk.feature_list)
    msg_chunk.func_pos = chunk.func_pos
    msg_chunk.head_pos = chunk.head_pos
    msg_chunk.link = chunk.link
    msg_chunk.score = chunk.score
    msg_chunk.token_pos = chunk.token_pos
    msg_chunk.next_link_id = chunk.next_link_id
    msg_chunk.prev_link_ids.extend(chunk.prev_link_ids)

    # Convert every token of this chunk into its message counterpart.
    words = []
    for token in chunk:
        word = nlp_messages.NlCabochaToken(
            surface=token.surface,
            id=token.id,
            additional_info=token.additional_info,
            feature_list=token.feature_list,
            ne=token.ne,
            normalized_surface=token.normalized_surface,
            pos=token.pos,
            pos1=token.pos1,
            pos2=token.pos2,
            pos3=token.pos3,
            ctype=token.ctype,
            cform=token.cform,
            genkei=token.genkei,
            yomi=token.yomi,
        )
        words.append(word)
    msg_chunk.tokens.extend(words)
    chunks.append(msg_chunk)

msg_chunks.chunks.extend(chunks)

# Show each chunk id together with the surface forms of its tokens.
for chunk in msg_chunks.chunks:
    print(chunk.id, [word.surface for word in chunk.tokens])

0 ['お', '皿', 'を']
1 ['二', '枚']
2 ['ください', '。']


In [33]:
from rasa_nlu.tokenizers import Tokenizer, Token

def tokenize_msg(text, msg_chunks):
    """Build ``Token`` objects for ``text`` from an already-parsed message.

    The surface form of every token in every chunk of ``msg_chunks`` is
    looked up in ``text`` and paired with the character index at which it
    is found.

    Args:
        text: The string the chunks were produced from.
        msg_chunks: Object exposing ``.chunks``; each chunk exposes
            ``.tokens`` and each token exposes ``.surface``.

    Returns:
        A list of ``Token(surface, offset)`` objects in message order.

    Raises:
        ValueError: Propagated from ``str.index`` when a surface form cannot
            be found in ``text`` at or after the end of the previous token.
    """
    surfaces = []
    for chunk in msg_chunks.chunks:
        surfaces.extend(tok.surface for tok in chunk.tokens)

    result = []
    cursor = 0
    for surface in surfaces:
        # Resume the search where the previous token ended so duplicate
        # surfaces map to successive positions.
        start = text.index(surface, cursor)
        result.append(Token(surface, start))
        cursor = start + len(surface)
    return result

# Quick check: tokenize from the prebuilt message and print text/offset pairs.
tokens = tokenize_msg("お皿を二枚ください。", msg_chunks)
for tok in tokens:
    print(tok.text, tok.offset)

お 0
皿 1
を 2
二 3
枚 4
ください 5
。 9
