In [181]:
#!pip install alkana
#!conda install -y -c conda-forge mecab-python3
#!pip install unidic
#!python -m unidic download

In [2]:
import requests
import os, time, sys, subprocess
import json
from IPython.display import JSON

In [187]:
import MeCab
import unidic
import pandas as pd
import alkana
import re
import os

# Helper function: convert alphabetic (English) words to katakana
# https://qiita.com/kunishou/items/814e837cf504ce287a13

def alpha_to_kana(text):
    """Replace ASCII-alphabet words in ``text`` with their katakana reading.

    The text is tokenized with MeCab (wakati mode). Every token made up only
    of ``a-z`` / ``A-Z`` is looked up with ``alkana.get_kana`` and, when a
    reading exists, replaced at the exact position of that token.

    Differences from a plain ``str.replace`` per word:
      * A short word is never substituted inside a longer one (e.g. "is"
        inside "this"), because replacement is aligned to token positions.
      * Words with no known reading are kept as they are instead of being
        removed from the text.

    Args:
        text: Input string, possibly mixing Japanese and English words.

    Returns:
        The input with known English words replaced by katakana. Characters
        outside the replaced tokens (including whitespace) are preserved.
    """
    # Pure half-width alphabet check (same character class as before).
    ascii_word = re.compile(r'[a-zA-Z]+')

    tagger = MeCab.Tagger('-Owakati')
    # split() without arguments also discards the trailing newline token.
    tokens = (tagger.parse(text) or "").split()

    pieces = []
    cursor = 0
    for token in tokens:
        found = text.find(token, cursor)
        if found < 0:
            # Token surface does not appear verbatim after the cursor;
            # leave that stretch of the text untouched.
            continue
        # Keep whatever sits between tokens (wakati output drops it).
        pieces.append(text[cursor:found])
        kana = alkana.get_kana(token) if ascii_word.fullmatch(token) else None
        pieces.append(kana or token)
        cursor = found + len(token)
    pieces.append(text[cursor:])
    return "".join(pieces)

In [202]:
import socket, json, random, math
# Mouth-shape targets keyed by phoneme symbol. Each value is an (x, y) pair
# read by make_mouth_map to fill "mouth_x" / "mouth_y"; None marks an axis
# that has no target for that phoneme.
vowel_map = dict(
    a=(0, 1),
    i=(0.25, 0.5),
    e=(0.5, 0.8),
    o=(0.75, 0.75),
    u=(1.0, 0.7),
)
consonant_map = dict(
    k=(0.25, 0.25),
    s=(0.5, 0.3),
    t=(0.5, 0.5),
    n=(0.5, 0.5),
    h=(None, 0.6),
    m=(None, 0),
    y=(0.75, 0.75),
    w=(0.75, 0.25),
    N=(0.5, 0),
)

def make_mouth_map(w):
    """Build the mouth-parameter dict for a single phoneme symbol.

    Looks ``w`` up in ``consonant_map`` and then ``vowel_map``. For each
    table containing it, the (x, y) pair is copied into ``"mouth_x"`` /
    ``"mouth_y"``. An axis whose entry is ``None`` is omitted.

    Args:
        w: Phoneme symbol (e.g. ``"a"``, ``"k"``, ``"N"``).

    Returns:
        A dict with zero, one or both of the keys ``"mouth_x"`` and
        ``"mouth_y"``. Unknown symbols give an empty dict.
    """
    mouth_map = {}
    # Consonants first, then vowels, so a vowel entry would win on a clash.
    for table in (consonant_map, vowel_map):
        if w not in table:
            continue
        for axis, value in zip(("x", "y"), table[w]):
            # Only None means "no target"; 0 is a real value (e.g. closed
            # mouth for "m") and must not be dropped by a truthiness test.
            if value is not None:
                mouth_map[f"mouth_{axis}"] = value
    return mouth_map

def interpolate(action_map, absolute_time, name, start_time, end_time, src_value, dest_value, step = 0.01):
    """Append eased keyframes moving ``name`` from ``src_value`` to ``dest_value``.

    Starting at ``start_time`` and advancing by ``step`` while the current
    time is below ``end_time``, one ``(begin, end, {name: value})`` tuple is
    appended to ``action_map`` per step. ``begin`` / ``end`` are offset by
    ``absolute_time``; ``value`` is blended with a quarter-sine ease
    (``sin(pi/2 * progress)``).

    Args:
        action_map: List that receives the generated tuples (mutated in place).
        absolute_time: Offset added to every generated timestamp.
        name: Key used in each generated value dict.
        start_time: Relative time of the first keyframe.
        end_time: Relative time at which generation stops (exclusive).
        src_value: Value at ``start_time``.
        dest_value: Value approached towards ``end_time``.
        step: Time increment between keyframes.
    """
    span = end_time - start_time
    quarter_turn = math.pi / 2
    t = start_time
    while t < end_time:
        progress = (t - start_time) / span
        ratio = math.sin(quarter_turn * progress)
        begin = absolute_time + t
        value = src_value * (1 - ratio) + dest_value * ratio
        action_map.append((begin, begin + step, {name: value}))
        t += step

def play_speech(text, speaker=1):
    """Synthesize ``text``, play it, and send matching face/mouth actions.

    Steps:
      1. English words in ``text`` are converted with ``alpha_to_kana``.
      2. The text is posted to the speech server on ``localhost:50021``
         (``/audio_query`` then ``/synthesis``) and the returned audio is
         written to a temporary ``.wav`` file.
      3. A timeline of ``(start, end, params)`` tuples is built from the
         mora lengths in the query result: mouth shapes per consonant/vowel,
         ``None`` params during pauses, and eased ``face_pitch`` motion per
         accent phrase.
      4. The wav is played with ``aplay`` and the timeline is sent as JSON,
         prefixed by its 4-byte big-endian length, to port 9998 on the
         local hostname.

    Args:
        text: Text to speak.
        speaker: Speaker id passed to the speech server.

    Raises:
        requests.HTTPError: If either HTTP request returns an error status.
        OSError: If the player cannot be started or the socket fails.
    """
    import tempfile  # not imported at file level; needed for the temp wav

    text = alpha_to_kana(text)
    query_res = requests.post("http://localhost:50021/audio_query", params={"text": text, "speaker": speaker})
    query_res.raise_for_status()
    data = query_res.json()
    wav_res = requests.post("http://localhost:50021/synthesis", params={"speaker": speaker}, json=data)
    wav_res.raise_for_status()

    # A real, uniquely named temp file (gettempprefix() is only the "tmp"
    # name prefix and would give a fixed "tmp.wav" in the working directory).
    fd, path = tempfile.mkstemp(suffix=".wav")
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(wav_res.content)

        action_map = []
        total_time = 0
        start_time = time.time()
        face_pitch = 0.5
        for acc in data["accent_phrases"]:
            accent_start_time = total_time
            for m in acc["moras"]:
                print(m["text"], end="")
                # Consonant (if any) comes before the vowel within a mora.
                for phoneme, length_key in ((m["consonant"], "consonant_length"),
                                            (m["vowel"], "vowel_length")):
                    if not phoneme:
                        continue
                    next_time = total_time + m[length_key]
                    action_map.append((start_time + total_time, start_time + next_time, make_mouth_map(phoneme)))
                    total_time = next_time
            accent_mid_time = total_time

            # Alternate the head between the upper and lower pitch ranges.
            dest_face_pitch = random.uniform(0.6, 0.8) if face_pitch >= 0.5 else random.uniform(0.2, 0.4)
            interpolate(action_map, start_time, "face_pitch", accent_start_time, accent_mid_time, face_pitch, dest_face_pitch)
            face_pitch = dest_face_pitch

            if acc["pause_mora"]:
                next_time = total_time + acc["pause_mora"]["vowel_length"]
                action_map.append((start_time + total_time, start_time + next_time, None))
                total_time = next_time

                # Return to neutral during the pause.
                accent_end_time = total_time
                dest_face_pitch = 0.5
                interpolate(action_map, start_time, "face_pitch", accent_mid_time, accent_end_time, face_pitch, dest_face_pitch)
                face_pitch = dest_face_pitch
        if face_pitch != 0.5:
            # Ease back to neutral over one second AFTER the utterance, on the
            # same time base as the rest of the timeline.
            interpolate(action_map, start_time, "face_pitch", total_time, total_time + 1, face_pitch, 0.5)

        # Play the audio. Linux-only for now.
        p = subprocess.Popen(["aplay", "-q", path])
        try:
            payload = json.dumps(action_map).encode('utf-8')
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.connect((socket.gethostname(), 9998))
                # sendall: send() may transmit only part of the buffer.
                s.sendall(len(payload).to_bytes(4, 'big'))
                s.sendall(payload)
        finally:
            # Always let playback finish before the wav file is deleted.
            p.wait()
    finally:
        os.remove(path)

In [204]:
# Smoke test: one short sentence with speaker 2.
play_speech("これは音声発話のテストなんですよ。", speaker=2)

コレワオンセエハツワノテストナンデスヨ

In [205]:
# Longer, multi-sentence sample with speaker 1.
play_speech(
    text="""
ギンプ、マイペイントや自作ソフトなど、お絵描きソフトをつくるための技術やコツを解説する本です。
お絵描きソフトを作ってみたい、でもどこから手を付けたらよいかわからない、という人に送る、業界初の入門書です？
""",
    speaker=1,
)

ギンプマイペイントヤジサクソフトナドオエカキソフトオツクルタメノギジュツヤコツオカイセツスルホンデスオエカキソフトオツクッテミタイデモドコカラテオツケタラヨイカワカラナイトイウヒトニオクルギョオカイハツノニュウモンショデス