In [2]:
import json

from sudachipy import tokenizer
from sudachipy import dictionary
from sudachipy import config
import json

# Read the Sudachi settings JSON from the path that `config.SETTINGFILE` points to.
with open(config.SETTINGFILE, "r", encoding="utf-8") as f:
    settings = json.load(f)

# Show which file was read, then pretty-print the parsed settings.
# ensure_ascii=False keeps the Japanese POS tags readable instead of \u-escaped.
print(config.SETTINGFILE)
print(json.dumps(settings, indent=2, ensure_ascii=False))

/Users/xiaofeiwu/jcloud/assets/langs/workspace/rasa/japanese/src/sudachipy/sudachipy/../resources/sudachi.json
{
  "systemDict": "system.dic",
  "characterDefinitionFile": "char.def",
  "inputTextPlugin": [
    {
      "class": "com.worksap.nlp.sudachi.DefaultInputTextPlugin"
    }
  ],
  "oovProviderPlugin": [
    {
      "class": "com.worksap.nlp.sudachi.MeCabOovProviderPlugin",
      "charDef": "char.def",
      "unkDef": "unk.def"
    },
    {
      "class": "com.worksap.nlp.sudachi.SimpleOovProviderPlugin",
      "oovPOS": [
        "補助記号",
        "一般",
        "*",
        "*",
        "*",
        "*"
      ],
      "leftId": 5968,
      "rightId": 5968,
      "cost": 3857
    }
  ],
  "pathRewritePlugin": [
    {
      "class": "com.worksap.nlp.sudachi.JoinNumericPlugin",
      "joinKanjiNumeric": true
    },
    {
      "class": "com.worksap.nlp.sudachi.JoinKatakanaOovPlugin",
      "oovPOS": [
        "名詞",
        "普通名詞",
        "一般",
        "*",
        "*",
        "*"


In [4]:
# Build the tokenizer from the loaded settings; the following cells all reuse `tokenizer_obj`.
tokenizer_obj = dictionary.Dictionary(settings).create()

In [6]:
def wakati_by_sudachi(text):
    """Split ``text`` with Sudachi and return its content words joined by spaces.

    The text is tokenized once in split mode C (the longest units). Only
    morphemes whose top-level part of speech is noun (名詞), verb (動詞) or
    adjective (形容詞) are kept, and each kept morpheme contributes its
    normalized form, which reduces spelling variation
    (e.g. 打込む -> 打ち込む, かつ丼 -> カツ丼).

    The part of speech and the normalized form are read from the morphemes of
    that single tokenization. Re-tokenizing each surface string on its own
    would analyse it out of sentence context and would only look at the first
    sub-token of the result.

    Args:
        text: The Japanese text to split.

    Returns:
        A single string with the selected words separated by one space;
        an empty string when nothing is selected.
    """
    mode = tokenizer.Tokenizer.SplitMode.C  # mode C: split into the longest units
    target_pos = ("名詞", "動詞", "形容詞")  # parts of speech to keep
    word_list = []
    for m in tokenizer_obj.tokenize(mode, text):
        # Empty surfaces have been observed in tokenizer output; skip them.
        if m.surface() == "":
            continue
        if m.part_of_speech()[0] in target_pos:
            word_list.append(m.normalized_form())
    return " ".join(word_list)  # join the kept words with spaces

wakati_by_sudachi('個人の感想です')

'個人の感想です'

In [8]:
# Multi-granular tokenization
# (The upstream example was produced with `system_full.dic`;
# you may not be able to replicate it with `system_core.dic`.)


# Split mode C: the longest units.
mode = tokenizer.Tokenizer.SplitMode.C
[m.surface() for m in tokenizer_obj.tokenize(mode, "医薬品安全管理責任者")]
# => ['医薬品安全管理責任者'] in this run (the whole compound stays one token)

['医薬品安全管理責任者']

In [9]:
# Split mode B: the same compound is split into four units in this run.
mode = tokenizer.Tokenizer.SplitMode.B
[m.surface() for m in tokenizer_obj.tokenize(mode, "医薬品安全管理責任者")]
# => ['医薬品', '安全', '管理', '責任者']

['医薬品', '安全', '管理', '責任者']

In [6]:
# Katakana reading of each token. This relies on the notebook-level `mode` assigned
# in an earlier cell; the four-token output matches split mode B.
[m.reading_form() for m in tokenizer_obj.tokenize(mode, "医薬品安全管理責任者")]

['イヤクヒン', 'アンゼン', 'カンリ', 'セキニンシャ']

In [19]:
import icu
# `tr` converts any script to Latin and then folds the result to plain ASCII.
tr = icu.Transliterator.createInstance('Any-Latin; Latin-ASCII').transliterate
sents = "試合はいつですか？"
# Drop Japanese punctuation before tokenizing so it does not show up as tokens.
sents = sents.translate(str.maketrans('', '', '、。！？'))
mode = tokenizer.Tokenizer.SplitMode.B
# Romanize the katakana reading of every token and join them with spaces.
' '.join(tr(m.reading_form()) for m in tokenizer_obj.tokenize(mode, sents))

'shiai ha itsu desu ka'

In [10]:
# Split mode A: the same compound is split into six units in this run.
mode = tokenizer.Tokenizer.SplitMode.A
[m.surface() for m in tokenizer_obj.tokenize(mode, "医薬品安全管理責任者")]
# => ['医薬', '品', '安全', '管理', '責任', '者']

['医薬', '品', '安全', '管理', '責任', '者']

In [11]:
# Morpheme information

# Keep the first morpheme of "食べ" in `m` for the next cell. The split mode is
# whatever the notebook-level `mode` currently holds.
m = tokenizer_obj.tokenize(mode, "食べ")[0]
m.surface() # => '食べ'

'食べ'

In [13]:
# Show the dictionary form, reading and part-of-speech list of the morpheme `m`
# obtained in the previous cell.
print([m.dictionary_form(), # => '食べる'
    m.reading_form(), # => 'タベ'
    m.part_of_speech()]) # => ['動詞', '一般', '*', '*', '下一段-バ行', '連用形-一般']

['食べる', 'タベ', ['動詞', '一般', '*', '*', '下一段-バ行', '連用形-一般']]


In [14]:
# Normalization

# normalized_form() maps the variant spelling 附属 to 付属.
tokenizer_obj.tokenize(mode, "附属")[0].normalized_form()
# => '付属'

'付属'

In [15]:
# The Latin-script word is normalized to its katakana form.
tokenizer_obj.tokenize(mode, "SUMMER")[0].normalized_form()
# => 'サマー'

'サマー'

In [16]:
# The variant spelling シュミレーション is normalized to シミュレーション.
tokenizer_obj.tokenize(mode, "シュミレーション")[0].normalized_form()
# => 'シミュレーション'

'シミュレーション'

In [6]:
import json
from sudachipy import tokenizer, dictionary, config
# Rebuild `settings` and `tokenizer_obj` (the same steps as at the top of the notebook)
# so that the cells from here on can run without the earlier cells.
with open(config.SETTINGFILE, "r", encoding="utf-8") as f:
    settings = json.load(f)
tokenizer_obj = dictionary.Dictionary(settings).create()

sents='東京都へ行く'
# `mode` is a notebook-level variable: the functions defined in later cells read it.
mode = tokenizer.Tokenizer.SplitMode.C
# For every morpheme print its character span, dictionary form, full POS list,
# and the third POS element (used as an entity label by later cells).
for m in tokenizer_obj.tokenize(mode, sents):
    print("(%d-%d)"%(m.begin(), m.end()), 
          m.dictionary_form(), 
          m.part_of_speech(),
          m.part_of_speech()[2]
         )

(0-3) 東京都 ['名詞', '固有名詞', '地名', '一般', '*', '*'] 地名
(3-4) へ ['助詞', '格助詞', '*', '*', '*', '*'] *
(4-6) 行く ['動詞', '非自立可能', '*', '*', '五段-カ行', '終止形-一般'] *


In [7]:
from natasha.markup import format_markup_css
class Matches(object):
    """Pair a text with the spans found in it and render them in a notebook.

    The object behaves like a read-only sequence of its ``matches``:
    iteration, indexing, ``len()`` and truthiness all delegate to that list.
    Each match is expected to be indexable, with the span start at index 0
    and the span end at index 1.
    """

    __attributes__ = ['text', 'matches']

    def __init__(self, text, matches):
        """Store the source ``text`` and its ``matches`` unchanged."""
        self.text, self.matches = text, matches

    def __len__(self):
        """Number of matches."""
        return len(self.matches)

    def __bool__(self):
        """True when there is at least one match."""
        return bool(self.matches)

    def __getitem__(self, index):
        """Return the match (or slice of matches) at ``index``."""
        return self.matches[index]

    def __iter__(self):
        """Iterate over the matches in stored order."""
        return iter(self.matches)

    def _repr_html_(self):
        """Build the HTML shown by Jupyter, with the matched spans marked up."""
        # Only the (start, end) pair of each match is passed to the formatter.
        spans = []
        for match in self.matches:
            spans.append((match[0], match[1]))
        return ''.join(format_markup_css(self.text, spans))

In [13]:
def get_entities(sents, split_mode=None):
    """Return ``(begin, end, label)`` tuples for the labelled morphemes of ``sents``.

    The label is the third element of a morpheme's part-of-speech list;
    morphemes whose third element is ``'*'`` are skipped. Because the label is
    a POS sub-category, common nouns are reported too (with the label 一般).

    Args:
        sents: The text to tokenize.
        split_mode: Sudachi split mode to use. When omitted, the notebook-level
            ``mode`` variable is used, as before; pass it explicitly to make
            the result independent of which cell last assigned ``mode``.

    Returns:
        A list of ``(begin, end, label)`` tuples sorted by ``begin``.
    """
    if split_mode is None:
        # Backward-compatible fallback to the notebook-global split mode.
        split_mode = mode
    entities = []
    for m in tokenizer_obj.tokenize(split_mode, sents):
        label = m.part_of_speech()[2]  # look the POS up once per morpheme
        if label != '*':
            entities.append((m.begin(), m.end(), label))
    entities.sort(key=lambda ent: ent[0])
    return entities
sents='東京都へ行く'
Matches(sents, get_entities(sents))

In [14]:
def get_segs(sents):
    """Return one ``'<label>_<text>'`` string per entity found in ``sents``.

    The entities come from ``get_entities``; ``<text>`` is the slice of
    ``sents`` covered by the entity's span.
    """
    segments = []
    for start, stop, label in get_entities(sents):
        segments.append(label + '_' + sents[start:stop])
    return segments
get_segs('東京都へ行く')

['地名_東京都']

In [22]:
def _morpheme_rows(sents, split_mode=None):
    """Yield one ``(span, word, pos, entity)`` tuple per morpheme of ``sents``.

    ``span`` is the ``"(begin-end)"`` text, ``word`` the dictionary form,
    ``pos`` the full part-of-speech list and ``entity`` its third element.
    When ``split_mode`` is omitted the notebook-level ``mode`` is used.
    """
    if split_mode is None:
        # Backward-compatible fallback to the notebook-global split mode.
        split_mode = mode
    for m in tokenizer_obj.tokenize(split_mode, sents):
        pos = m.part_of_speech()  # look the POS up once per morpheme
        yield ("(%d-%d)" % (m.begin(), m.end()), m.dictionary_form(), pos, pos[2])


def print_entities(sents, split_mode=None):
    """Print span, dictionary form, POS list and entity label of every morpheme.

    Args:
        sents: The text to tokenize.
        split_mode: Optional Sudachi split mode; defaults to the notebook-level
            ``mode`` variable.
    """
    for row in _morpheme_rows(sents, split_mode):
        print(*row)


def entities_df(sents, split_mode=None):
    """Return the per-morpheme rows of ``sents`` as a table built by ``sagas.to_df``.

    Args:
        sents: The text to tokenize.
        split_mode: Optional Sudachi split mode; defaults to the notebook-level
            ``mode`` variable.

    Returns:
        The object produced by ``sagas.to_df`` for the columns
        ``span``, ``word``, ``pos`` and ``entity``.
    """
    import sagas
    rows = list(_morpheme_rows(sents, split_mode))
    return sagas.to_df(rows, ['span', 'word', 'pos', 'entity'])

In [23]:
# Sample sentence for the entity helpers defined in the cells above.
text='その博物館はまだ開いていません。'
# print_entities(text)  # plain-text alternative to the table shown below
# Summarise the labelled segments on one line, separated by "; ".
print("(%s)"%'; '.join(get_segs(text)))
# The last expression is displayed as the per-morpheme table.
entities_df(text)

(一般_博物館)


Unnamed: 0,span,word,pos,entity
0,(0-2),その,"[連体詞, *, *, *, *, *]",*
1,(2-5),博物館,"[名詞, 普通名詞, 一般, *, *, *]",一般
2,(5-6),は,"[助詞, 係助詞, *, *, *, *]",*
3,(6-8),まだ,"[副詞, *, *, *, *, *]",*
4,(8-10),開く,"[動詞, 一般, *, *, 五段-カ行, 連用形-イ音便]",*
5,(10-11),て,"[助詞, 接続助詞, *, *, *, *]",*
6,(11-12),いる,"[動詞, 非自立可能, *, *, 上一段-ア行, 連用形-一般]",*
7,(12-14),ます,"[助動詞, *, *, *, 助動詞-マス, 未然形-一般]",*
8,(14-15),ぬ,"[助動詞, *, *, *, 助動詞-ヌ, 終止形-撥音便]",*
9,(15-16),。,"[補助記号, 句点, *, *, *, *]",*
