In [1]:
using PyCall
using Unicode
# Python `MeCab` module, loaded through PyCall.
MeCab = pyimport("MeCab")
# Tagger using ChaSen-style output (`-Ochasen`) and the dictionary directory passed with `-d`.
# `words` keeps only rows of this tagger's output that split into six tab-separated fields.
mecab = MeCab.Tagger("-Ochasen -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd")
# Tagger created with an empty option string; `simple_words` keeps only rows of its
# output that split into eight tab-separated fields.
simple_mecab = MeCab.Tagger("")
"""
    normarize_word(word)

Return `word` converted to Unicode normalization form NFKC, with every run of
digits then replaced by a single `"0"`.

Requires the `Unicode` standard library to be loaded (`using Unicode`).
"""
function normarize_word(word)
    folded = Unicode.normalize(word, :NFKC)
    # Digits are matched after normalization, so the regex sees the NFKC form.
    return replace(folded, r"\d+" => "0")
end
"""
    normarize(words)

Apply `normarize_word` to every element of `words` and return the mapped collection.
"""
function normarize(words)
    return map(normarize_word, words)
end
"""
    words(str)

Parse `str` with the global `mecab` tagger and return one vector of fields per
output line. Each line is split on tab characters, and lines that do not yield
exactly six fields are discarded.
"""
function words(str)
    parsed = mecab.parse(str)
    rows = [split(line, "\t") for line in split(parsed, "\n")]
    return filter(row -> length(row) == 6, rows)
end
"""
    simple_words(str)

Parse `str` with the global `simple_mecab` tagger and return one vector of fields
per output line. Each line is split on tab characters, and lines that do not
yield exactly eight fields are discarded.
"""
function simple_words(str)
    parsed = simple_mecab.parse(str)
    rows = [split(line, "\t") for line in split(parsed, "\n")]
    return filter(row -> length(row) == 8, rows)
end
# Field accessors for one token row (a vector of tab-separated fields).
# NOTE(review): the index choices assume the six-field layout kept by `words`
# (surface, reading, lemma, part of speech, ...) — confirm against the tagger output.

"Return the first field of the token row `word`."
function surface(word)
    return word[firstindex(word)]
end

"Return the second field of the token row `word`."
function yomi(word)
    return getindex(word, 2)
end

"Return the third field of the token row `word`."
function lemma(word)
    return getindex(word, 3)
end

"Return the fourth field of the token row `word`: the full hyphen-separated part-of-speech string."
function raw_pos(word)
    return getindex(word, 4)
end

"Return the part of `raw_pos(word)` that precedes its first `-` (the whole string if there is none)."
function pos(word)
    parts = split(raw_pos(word), "-")
    return parts[firstindex(parts)]
end

pos (generic function with 1 method)

In [2]:
# Inspect the eight-field rows that `simple_words` returns for a short phrase.
simple_words("やさしい日本語を")

4-element Vector{Vector{SubString{String}}}:
 ["やさしい", "ヤサシー", "ヤサシイ", "優しい", "形容詞-一般", "形容詞", "連体形-一般", "0,3"]
 ["日本", "ニッポン", "ニッポン", "日本", "名詞-固有名詞-地名-国", "", "", "3"]
 ["語", "ゴ", "ゴ", "語", "名詞-普通名詞-一般", "", "", "1"]
 ["を", "オ", "ヲ", "を", "助詞-格助詞", "", "", ""]

In [3]:
using Random, StatsBase

# Full-width / Japanese punctuation lemmas and the ASCII mark returned in their place.
const GEN_PUNCTUATION = (
    "，" => ",",
    "．" => ".",
    "、" => ",",
    "。" => ".",
    "！" => "!",
    "？" => "?",
    "（" => "(",
    "）" => ")",
)

# `(letters, relative weight)` entries sampled at even positions of a generated word.
const GEN_VOWELS = (
    ("a", 8.2),
    ("e", 13.0),
    ("i", 7.0),
    ("o", 7.5),
    ("u", 2.8),
    ("aa", 2.42),
    ("ee", 3.25),
    ("ii", 1.75),
    ("oo", 1.875),
    ("uu", 0.7),
    ("ou", 0.79),
    ("ea", 0.6),
    ("ai", 0.47),
    ("ei", 0.12),
    ("oi", 0.95),
)

# Single consonants, sampled at odd positions. The integer weights are kept exactly as
# originally written so that the seeded output of `gen_raw_word` stays unchanged.
const GEN_CONSONANTS = (
    ("b", 1.5),
    ("c", 2.8),
    ("d", 4.3),
    ("f", 2.2),
    ("g", 2.0),
    ("h", 6.1),
    ("j", 2),
    ("k", 0.77),
    ("l", 4),
    ("m", 2.4),
    ("n", 6.7),
    ("p", 1.9),
    ("q", 0.095),
    ("r", 6.0),
    ("s", 6.3),
    ("t", 9.1),
    ("v", 0.98),
    ("w", 2.4),
    ("x", 0.15),
    ("y", 0.15),
    ("z", 0.074),
)

# Consonant clusters that are additionally allowed at every odd position except the first.
const GEN_CONSONANT_CLUSTERS = (
    ("st", 0.91),
    ("tr", 0.3),
    ("ll", 1.49),
    ("tt", 0.94),
)

# Split a tuple of `(letters, weight)` entries into a letter vector and its sampling weights.
function gen_letter_table(entries)
    letters = collect(map(first, entries))
    weights = collect(map(last, entries))
    return letters, Weights(weights)
end

"""
    gen_raw_word(word)

Generate the pseudo-word for one token row `word`.

If `lemma(word)` is one of the punctuation marks in `GEN_PUNCTUATION`, the matching
ASCII mark is returned. Otherwise a string of alternating consonant and vowel groups is
sampled with a `MersenneTwister` seeded from the first 16 hex digits of the MD5 digest
of the lemma, so the same lemma always seeds the generator identically.

The number of sampled groups is twice the character length of `yomi(word)` when
`pos(word)` is `"名詞"` or `"代名詞"`, and twice the character length of the lemma
otherwise. Odd positions draw a consonant (clusters are excluded at position 1), even
positions draw a vowel.

Relies on `md5` (MD5 package) and on `sample` / `Weights` (StatsBase) being in scope
when called.
"""
function gen_raw_word(word)
    lem = lemma(word)
    for (mark, ascii) in GEN_PUNCTUATION
        lem == mark && return ascii
    end

    digest = bytes2hex(md5(lem))
    # 16 hex digits are 64 bits; parsing as Int128 keeps the value non-negative.
    rng = MersenneTwister(parse(Int128, SubString(digest, 1, 16), base=16))

    kind = pos(word)
    basis = (kind == "名詞" || kind == "代名詞") ? yomi(word) : lem
    group_count = length(basis) * 2

    # Build each table once per call instead of once per sampled group.
    vowels, vowel_weights = gen_letter_table(GEN_VOWELS)
    onsets, onset_weights = gen_letter_table(GEN_CONSONANTS)
    inner, inner_weights =
        gen_letter_table((GEN_CONSONANTS..., GEN_CONSONANT_CLUSTERS...))

    out = IOBuffer()
    for i in 1:group_count
        if iseven(i)
            print(out, sample(rng, vowels, vowel_weights))
        elseif i == 1
            print(out, sample(rng, onsets, onset_weights))
        else
            print(out, sample(rng, inner, inner_weights))
        end
    end
    return String(take!(out))
end

gen_raw_word (generic function with 1 method)

In [4]:
"""
    gen_word(word)

Tokenize the string `word` with `simple_words`, generate a pseudo-word for every
resulting row with `gen_raw_word`, and concatenate the pieces without a separator.
"""
function gen_word(word)
    rows = simple_words(word)
    # Remove the second field of each eight-field row; the remaining fields 2-4 are
    # then the ones that `yomi`, `lemma` and `raw_pos` index.
    trimmed = [row[[i for i in 1:length(row) if i != 2]] for row in rows]
    return join(map(gen_raw_word, trimmed))
end

gen_word (generic function with 1 method)

In [5]:
"""
    format(words)

Render token rows (as returned by `words`) into one line of generated text.

For every row the surface form is passed through `gen_word`, then:

- generated `.`, `!` or `?` is appended directly and the next regular word is capitalized;
- generated `,`, `(` or `)` is appended directly;
- the particles `は`, `が`, `を` append nothing, `"s"` and `"n"` respectively;
- the particle `の` repeats the last character of the text emitted so far, unless the
  last whitespace-separated word already ends in a doubled character. Nothing is
  appended when no word has been emitted yet;
- other particles, auxiliary verbs and `名詞-非自立` rows are appended without a space;
- rows whose surface contains an ASCII letter are emitted as the surface itself,
  capitalized, after a space;
- everything else is emitted as the generated word after a space.

Returns the text with leading and trailing whitespace stripped.
"""
function format(words)
    text = ""
    sentence_start = true
    for word in words
        gw = gen_word(surface(word))
        if occursin(r"[.!?]", gw)
            text *= gw
            sentence_start = true
        elseif occursin(r"[,()]", gw)
            text *= gw
        elseif lemma(word) == "は" && pos(word) == "助詞"
            # Topic particle: intentionally emits nothing.
        elseif lemma(word) == "が" && pos(word) == "助詞"
            text *= "s"
        elseif lemma(word) == "の" && pos(word) == "助詞"
            pieces = split(text)
            # Guard: `の` may be the very first token, leaving nothing to double.
            if !isempty(pieces)
                prev = pieces[end]
                tail = last(prev)
                # `prevind` steps back one character (not one byte), and a
                # one-character word has no preceding character to compare with.
                if length(prev) < 2 || prev[prevind(prev, lastindex(prev))] != tail
                    text *= tail
                end
            end
        elseif lemma(word) == "を" && pos(word) == "助詞"
            text *= "n"
        elseif pos(word) == "助詞" || pos(word) == "助動詞" ||
               occursin(r"名詞-非自立", raw_pos(word))
            text *= gw
        elseif occursin(r"[A-Za-z]", surface(word))
            text *= " " * uppercasefirst(surface(word))
            sentence_start = false
        else
            text *= " " * (sentence_start ? uppercasefirst(gw) : gw)
            sentence_start = false
        end
    end
    return strip(text)
end

format (generic function with 1 method)

In [6]:
"""
    translate(japanese)

Tokenize `japanese` with `words` and render the resulting rows with `format`.
"""
function translate(japanese)
    tokens = words(japanese)
    return format(tokens)
end

translate (generic function with 1 method)

In [7]:
using MD5
# Demo: show the generated text next to the space-separated readings of the same tokens.
text = "こんにちは！Bosa Nocenooniiです．僕は日本語を話します．"
(translate(text), join(map(yomi, words(text)), " "))

("Gitosii! Bosa Nocenooniiseje. Deree weereestelegan tatutadaa.", "コンニチハ ！ Bosa Nocenoonii デス ． ボク ハ ニホンゴ ヲ ハナシ マス ．")

In [8]:
# Inspect the eight-field rows that `simple_words` returns for the demo sentence.
simple_words("こんにちは！Bosa Nocenooniiです．僕は日本語を話します．")

14-element Vector{Vector{SubString{String}}}:
 ["こんにちは", "コンニチワ", "コンニチハ", "今日は", "感動詞-一般", "", "", "5"]
 ["！", "", "", "！", "補助記号-句点", "", "", ""]
 ["Bosa", "Bosa", "Bosa", "Bosa", "名詞-普通名詞-一般", "", "", "0"]
 ["Nocenoonii", "Nocenoonii", "Nocenoonii", "Nocenoonii", "名詞-普通名詞-一般", "", "", "0"]
 ["です", "デス", "デス", "です", "助動詞", "助動詞-デス", "終止形-一般", ""]
 ["．", "", "", "．", "補助記号-句点", "", "", ""]
 ["僕", "ボク", "ボク", "僕", "代名詞", "", "", "1,0"]
 ["は", "ワ", "ハ", "は", "助詞-係助詞", "", "", ""]
 ["日本", "ニッポン", "ニッポン", "日本", "名詞-固有名詞-地名-国", "", "", "3"]
 ["語", "ゴ", "ゴ", "語", "名詞-普通名詞-一般", "", "", "1"]
 ["を", "オ", "ヲ", "を", "助詞-格助詞", "", "", ""]
 ["話し", "ハナシ", "ハナス", "話す", "動詞-一般", "五段-サ行", "連用形-一般", "2"]
 ["ます", "マス", "マス", "ます", "助動詞", "助動詞-マス", "終止形-一般", ""]
 ["．", "", "", "．", "補助記号-句点", "", "", ""]