# FT用データ生成スクリプト

In [40]:
# !conda install -y -c conda-forge kalpy \
# kaldi \
# pynini

# # パッケージインストール
# !pip install -r requirements.sbv.txt

In [41]:
# !pip list

In [42]:
# # mfa
# # 日本語辞書のダウンロード
# !mfa model download dictionary japanese_mfa

# # 日本語音響モデルのダウンロード
# !mfa model download acoustic japanese_mfa

## テキスト対話データ生成

In [43]:
import os
from typing import Literal
import ast

from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PDFMinerLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI


# Load environment variables (API keys etc.) from the project's .env file.
# NOTE(review): this is a hard-coded absolute path, whereas the other paths in
# this notebook are built from expanduser("~") — confirm it resolves on the
# machine that runs the notebook.
load_dotenv("/users/s1f102201582/projects/mhcc-moshi/.env")

True

In [44]:
#config
from os.path import join, expanduser

# Credentials / endpoint used by the OpenAI-compatible embedding client.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
BASE_URL = "https://api.openai.iniad.org/api/v1"
# Chat model name and sampling temperature passed to ChatGoogleGenerativeAI.
MODEL='gemini-2.5-flash'
TEMPERATURE = 1.0
# NOTE(review): GPU index 3 is skipped ("0,1,2,4") — confirm this is intentional.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,4"

# Sampling rate (Hz) of the generated audio.
setting_sr = 16000

# Number of dialogue audio files to generate in one run.
gen_dial_num = 5

# Whether to delete previously generated dialogue data before generating.
# When True, every directory in base_paths is removed and recreated.
IS_REMOVE_EXIST_FILE = True

# Output folders for the fine-tuning JSON and audio files
# (both currently point to the same directory).
home_dir = expanduser("~")
json_dir_path = join(home_dir, "projects/mhcc-moshi/moshi/data/v1/data_stereo")
audio_dir_path = join(home_dir, "projects/mhcc-moshi/moshi/data/v1/data_stereo")

# MFA-related paths: acoustic model archive plus per-run input/output folders.
model_dir = join(home_dir, "Documents/MFA/pretrained_models/acoustic/japanese_mfa.zip")
mfa_input_dir = join(home_dir, "projects/mhcc-moshi/moshi/data/v1/mfa_input")
mfa_output_dir = join(home_dir, "projects/mhcc-moshi/moshi/data/v1/mfa_output")

# Directory containing the PDFs that are loaded for RAG.
rag_pdf_dir = join(home_dir, "projects/mhcc-moshi/mental_docs/")

In [45]:
# Directories this notebook writes to. They are created up front so later
# cells can assume they exist; the folder-reset cell also iterates this list.
base_paths = [
    json_dir_path,
    audio_dir_path,
    mfa_input_dir,
    mfa_output_dir,
]

for p in base_paths:
    # exist_ok=True replaces the check-then-create pattern, so the cell is
    # idempotent and cannot race with another process creating the directory.
    os.makedirs(p, exist_ok=True)

In [46]:
# Chat model used to generate the text dialogues.
model = ChatGoogleGenerativeAI(
                 model=MODEL,
                 temperature=TEMPERATURE)

# Embedding model used to index and query the RAG documents
# (served through the OpenAI-compatible endpoint at BASE_URL).
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
    openai_api_base=BASE_URL,
    model="text-embedding-3-large"
)

# Vector store that holds the embedded document chunks.
# NOTE(review): persist_directory is not set, so the collection is presumably
# rebuilt from the PDFs on every kernel start — confirm that is acceptable.
vector_store = Chroma(
    collection_name="collection",
    embedding_function=embeddings,
    # persist_directory = "/path/to/db_file" # if necessary
)

In [47]:
# Load every PDF directly under rag_pdf_dir (non-recursive glob "*.pdf")
# using PDFMinerLoader, showing a progress bar while loading.
loader = DirectoryLoader(
    rag_pdf_dir,
    glob="*.pdf",
    show_progress=True,
    loader_cls=PDFMinerLoader,
)
docs = loader.load()
# Report how many documents were loaded as a quick sanity check.
print(f"Loaded {len(docs)} documents")

  0%|                                                                                             | 0/3 [00:00<?, ?it/s]Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cann

Loaded 3 documents





In [48]:
# Debug
# for doc in docs:
#     print("-------------------------------------------------")
#     print(doc.metadata)
#     print(len(doc.page_content))
#     print(doc.page_content[:100])

In [49]:
# Split the loaded documents into chunks of 1000 characters with a
# 200-character overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True, # keep track of each chunk's offset in the source document
)
splits = text_splitter.split_documents(docs)

# Add the chunks to the vector store.
# NOTE(review): re-running this cell without recreating vector_store
# presumably inserts the same chunks again — confirm before re-executing.
document_ids = vector_store.add_documents(documents=splits)

In [50]:
from langchain.agents.middleware import dynamic_prompt, ModelRequest

@dynamic_prompt
def prompt_with_context(request: ModelRequest) -> str:
    """Build the system prompt, embedding retrieved RAG context.

    The text of the most recent message in the agent state is used as the
    similarity-search query; the two best-matching chunks are joined with
    blank lines and appended to a fixed instruction.

    Args:
        request: Model request whose ``state["messages"]`` holds the conversation.

    Returns:
        The system prompt string containing the retrieved context.
    """
    latest_message = request.state["messages"][-1]
    hits = vector_store.similarity_search(latest_message.text, k=2)
    context_block = "\n\n".join([hit.page_content for hit in hits])

    return (
        "You are a helpful assistant. Use the following context in your response:"
        "\n\n" + context_block
    )

In [51]:
from typing import Literal

from pydantic import BaseModel, Field


# One utterance of a generated counselling dialogue.
# NOTE(review): the Japanese docstring and Field descriptions are deliberately
# left untranslated — pydantic presumably exposes them in the schema that is
# handed to the LLM through ToolStrategy, so they act as prompt text.
class Dialogue(BaseModel):
    """対話データを構成する対話クラス"""
    # Speaker label; per the description string, "A" is the counsellor and "B" the client.
    speaker: Literal["A", "B"] = Field(..., description="話者。Aはカウンセラー、Bはクライエントを表す。")
    # What the speaker said.
    text: str = Field(..., description="話者が話した内容。")

# Structured-output schema for a whole counselling dialogue: an ordered list
# of Dialogue turns.
# NOTE(review): the Japanese docstring/description is kept as-is because it is
# presumably part of the schema text sent to the LLM, not just documentation.
class Dialogues(BaseModel):
    """カウンセリングを目的としたカウンセリング対話データ"""
    # Turns of the dialogue, in the order they are spoken.
    dialogues: list[Dialogue] = Field(..., description="対話データを構成する対話クラスのリスト。")

In [52]:
from langchain.agents import create_agent
from langchain.agents.structured_output import ToolStrategy

# Agent that generates a dialogue as a structured `Dialogues` object.
# - no tools are registered;
# - prompt_with_context supplies the system prompt with retrieved RAG context;
# - handle_errors is a Japanese message asking the model to regenerate the
#   dialogue so that it matches the format (presumably sent back to the model
#   when the structured output fails validation — confirm in the LangChain docs).
agent = create_agent(
    model, 
    tools=[],
    middleware=[prompt_with_context],
    response_format=ToolStrategy(
        Dialogues,
        handle_errors="フォーマットに合うように、もう一度対話データを生成してください。"
    )
)

In [53]:
#promptを作成
import random


# Counselling stages/themes; one is picked at random for each generated dialogue.
sessions = [
    "【段階：初期】信頼関係を築きつつ、悩みの背景を深掘りするシーン",
    "【段階：中期】クライエントの「すべき思考」に焦点を当て、認知の歪みを扱うシーン",
    "【段階：終結期】これまでのセッションを振り返り、終結に向けて準備するシーン",
]

def gen_prompt_txt():
    """Build the user prompt that asks the LLM to simulate one counselling session.

    A stage/theme is drawn uniformly at random from ``sessions`` and embedded
    into a fixed Japanese instruction template (role definitions, style and
    length requirements).

    Returns:
        str: The complete prompt text.
    """
    # random.choice follows len(sessions) automatically; the previous
    # random.randint(0, 2) silently ignored any entry added beyond the third.
    choiced_session = random.choice(sessions)
    # NOTE(review): the "構成" line below always describes the conversation as a
    # first intake hearing ("初回のヒアリング"), even when a middle or closing
    # stage is selected — confirm whether that wording is intended.
    prompt_txt = f"""メンタルヘルスケアカウンセリングのセッションをシミュレーションしてください。
シミュレーションしたい「段階」と「テーマ」:
{choiced_session}

役割定義:
A (カウンセラー): メンタルヘルスケアの専門知識を持つ経験豊富なカウンセラー。傾聴と共感の姿勢を基本とし、クライエントの言葉を促すように、優しく、自然な話し言葉（「〜ですね」「〜でしたか」など）を使います。
B (クライエント): 仕事上の悩みだけでなく、日常生活全般に対して漠然とした不安や焦りを感じている人物。

対話の要件:
スタイル: 実際の会話の文字起こしのように、堅苦しくない自然な「話し言葉」を使用してください。
相槌 (あいづち): カウンセラー（A）は、クライエント（B）の話を促し、共感を示すため、「ええ」「はい」「そうなんですね」「なるほど」といった細かな相槌を頻繁に、適切なタイミングで挿入してください。
構成: 会話が途中で途切れるのではなく、初回のヒアリングとして「一区切り」がつき、自然に終了する流れにしてください（例：次回の約束、今回のまとめなど）。
分量: 会話の往復は合計12〜20ターン程度、全体の文字数が合計500〜800文字程度になるように構成してください。
"""
    return prompt_txt

In [54]:
import time
from google.api_core.exceptions import ResourceExhausted

# Maximum number of attempts per dialogue (first try included).
max_retries = 5
# Wait before the first retry, in minutes; doubled after every failed attempt.
base_wait_time = 1 # minutes

# Text dialogue generation
def gen_txt_dialogue():
    """Generate one counselling dialogue as structured text.

    Builds a fresh prompt with ``gen_prompt_txt()`` and invokes the agent.
    Rate-limit errors (``ResourceExhausted``) are retried with exponential
    backoff: ``base_wait_time`` minutes before the second attempt, doubling
    each time, for at most ``max_retries`` attempts in total.

    Returns:
        The ``dialogues`` list of the agent's structured response.

    Raises:
        ResourceExhausted: If every one of the ``max_retries`` attempts is
            rate-limited.
        Exception: Any other error from the agent propagates immediately.
    """
    prompt = gen_prompt_txt()

    for attempt in range(1, max_retries + 1):
        try:
            resp = agent.invoke({"messages": [{"role": "user", "content": prompt}]})
            break
        except ResourceExhausted:
            # Give up only after the final attempt. The previous condition
            # (`i < max_retries - 1`) raised one attempt too early.
            if attempt == max_retries:
                raise
            # The previous `base_wait_time ** i` is always 1 for a base of 1,
            # so the wait never grew; double it per attempt instead.
            wait_time = base_wait_time * (2 ** (attempt - 1)) * 60
            time.sleep(wait_time)

    dialogues_list = resp["structured_response"].dialogues
    return dialogues_list

In [55]:
#DEBUG
# txt_dialogue = gen_txt_dialogue()
# print(txt_dialogue)
# lst_dialogue = txt_to_lst(txt_dialogue)
# print(lst_dialogue)

## テキスト対話データを音声対話データに変換 

In [56]:
from style_bert_vits2.nlp import bert_models
from style_bert_vits2.constants import Languages
from pathlib import Path
from huggingface_hub import hf_hub_download
from style_bert_vits2.tts_model import TTSModel

# Load the Japanese BERT model and tokenizer used by Style-Bert-VITS2.
bert_models.load_model(Languages.JP, "ku-nlp/deberta-v2-large-japanese-char-wwm")
bert_models.load_tokenizer(Languages.JP, "ku-nlp/deberta-v2-large-japanese-char-wwm")
# Local directory that the voice model files are downloaded into.
assets_root = Path("model_assets")

# # Alternative voice: Koharune Ami
# model_file = "koharune-ami/koharune-ami.safetensors"
# config_file = "koharune-ami/config.json"
# style_file = "koharune-ami/style_vectors.npy"
# hf_repo = "litagin/sbv2_koharune_ami"

# # Alternative voice: Amitaro
# model_file = "amitaro/amitaro.safetensors"
# config_file = "amitaro/config.json"
# style_file = "amitaro/style_vectors.npy"
# hf_repo = "litagin/sbv2_amitaro"


# Default female voice 2 (jvnv-F2) — becomes A_model, used for speaker "A".
model_file = "jvnv-F2-jp/jvnv-F2_e166_s20000.safetensors"
config_file = "jvnv-F2-jp/config.json"
style_file = "jvnv-F2-jp/style_vectors.npy"
hf_repo = "litagin/style_bert_vits2_jvnv"

# Download the model weights, config and style vectors from the Hugging Face Hub.
for file in [model_file, config_file, style_file]:
    print(file)
    hf_hub_download(hf_repo, file, local_dir="model_assets")

A_model = TTSModel(
    model_path=assets_root / model_file,
    config_path=assets_root / config_file,
    style_vec_path=assets_root / style_file,
    device="cuda",
)

# Default male voice 2 (jvnv-M2) — becomes B_model, used for speaker "B".
# hf_repo is reused from the female voice above (same repository).
model_file = "jvnv-M2-jp/jvnv-M2-jp_e159_s17000.safetensors"
config_file = "jvnv-M2-jp/config.json"
style_file = "jvnv-M2-jp/style_vectors.npy"

for file in [model_file, config_file, style_file]:
    print(file)
    hf_hub_download(hf_repo, file, local_dir="model_assets")

B_model = TTSModel(
    model_path=assets_root / model_file,
    config_path=assets_root / config_file,
    style_vec_path=assets_root / style_file,
    device="cuda",
)

jvnv-F2-jp/jvnv-F2_e166_s20000.safetensors
jvnv-F2-jp/config.json
jvnv-F2-jp/style_vectors.npy
jvnv-M2-jp/jvnv-M2-jp_e159_s17000.safetensors
jvnv-M2-jp/config.json
jvnv-M2-jp/style_vectors.npy


In [57]:
def build_audio_synth_prompt(text_dialogue_list):
    """Build the assist text passed to the TTS models.

    The result is a fixed Japanese header followed by the full transcript,
    one ``"<speaker>: <text>"`` line per turn.

    Args:
        text_dialogue_list: Iterable of turns exposing ``speaker`` and ``text``.

    Returns:
        str: Header plus transcript; just the header when there are no turns.
    """
    header = """あなたがこれから音声合成するテキストは以下の対話内容のワンフレーズです。
この対話の文脈に合うように音声合成してください。

<対話内容の全文>"""
    # Each turn starts on its own line, directly after the header.
    transcript_lines = [f"\n{turn.speaker}: {turn.text}" for turn in text_dialogue_list]
    return header + "".join(transcript_lines)

In [58]:
from typing import Literal

def sbv_tts(text: str, speaker: Literal["A", "B"], assist_text=None):
    """Synthesize one utterance with the voice assigned to ``speaker``.

    Speaker "A" is rendered with ``A_model`` in the 'Happy' style; any other
    speaker value is rendered with ``B_model`` in the 'Sad' style.

    Args:
        text: Utterance to synthesize.
        speaker: "A" or "B".
        assist_text: Optional assist text forwarded to the model. Assist-text
            conditioning is switched on only when this is not None.

    Returns:
        The ``(sr, audio)`` pair returned by ``TTSModel.infer``.
    """
    # Choose voice and style once instead of duplicating the whole infer call.
    if speaker == "A":
        tts_model, style = A_model, 'Happy'
    else:
        tts_model, style = B_model, 'Sad'

    sr, audio = tts_model.infer(
        text=text,
        style=style,
        style_weight=1,
        split_interval=0.3,
        # Pass a real bool; the previous expression produced None when no
        # assist text was given.
        use_assist_text=assist_text is not None,
        assist_text=assist_text,
    )

    return sr, audio

In [59]:
import librosa
import numpy as np

def gen_audio_dialogue(text_dialogue_list, prompt):
    """Synthesize a whole dialogue into one two-channel waveform.

    Each turn is synthesized with ``sbv_tts``, converted to float32, resampled
    to ``setting_sr`` when needed and followed by 0.3 s of silence. Turns are
    laid out back to back on a shared timeline; speaker "A" is written to
    channel 0 and every other speaker to channel 1.

    Args:
        text_dialogue_list: Iterable of turns exposing ``speaker`` and ``text``.
        prompt: Assist text forwarded to ``sbv_tts`` for every turn.

    Returns:
        numpy.ndarray: float32 array of shape ``(total_samples, 2)``.
    """
    silence_sec = 0.3
    num_silent_samples = int(setting_sr * silence_sec)

    # Keep the audio in memory as (channel, samples) pairs; no temp files needed.
    turns = []
    for dial in text_dialogue_list:
        sr, wav = sbv_tts(dial.text, dial.speaker, prompt)

        # Always convert to float32 so the stereo mix has one consistent scale.
        # Previously this only happened when resampling, so integer samples
        # were mixed in unscaled whenever sr == setting_sr.
        if np.issubdtype(wav.dtype, np.integer):
            # assumes 16-bit PCM from the TTS model — TODO confirm
            wav = wav.astype(np.float32) / 32768.0
        else:
            wav = wav.astype(np.float32)

        # Convert the sampling rate if the model's rate differs from the target.
        if sr != setting_sr:
            wav = librosa.resample(wav, orig_sr=sr, target_sr=setting_sr)

        # Pick the channel from the speaker label, not from the turn index:
        # index parity swaps channels as soon as the dialogue starts with "B"
        # or one speaker takes two turns in a row.
        channel = 0 if dial.speaker == "A" else 1

        silence = np.zeros(num_silent_samples, dtype=wav.dtype)
        turns.append((channel, np.concatenate((wav, silence))))

    # Total timeline length = sum of all (turn + silence) lengths.
    total_len = sum(len(samples) for _, samples in turns)

    # Zero-initialised (2, total_len) buffer; a turn fills only its own channel.
    stereo = np.zeros((2, total_len), dtype=np.float32)

    pos = 0
    for channel, samples in turns:
        stereo[channel, pos:pos + len(samples)] = samples
        pos += len(samples)

    # Transpose to (samples, channels).
    return stereo.T

## MFA (Montreal Forced Aligner) による音声アラインメント

In [60]:
import copy

def correct_json(full_text, align_json):
    """Restore unknown words and punctuation in an MFA word alignment.

    Walks the ``tiers.words.entries`` list (``[start, end, label]`` items) in
    order while tracking how much of ``full_text`` has been consumed:

    * A run of placeholder labels (``<unk>`` / ``<sil>``) is merged into one
      entry spanning the whole run, and its label is replaced by the text
      found in ``full_text`` between the neighbouring labels.
    * For any other label, one punctuation character directly before or after
      the word in ``full_text`` is attached to the label.

    Args:
        full_text: The transcript the alignment was produced from, as a single
            string.
        align_json: Parsed alignment JSON; it is not modified.

    Returns:
        A deep copy of ``align_json`` with corrected entries.

    Raises:
        ValueError: If the text for a placeholder cannot be located in
            ``full_text``.
    """
    new_align_json = copy.deepcopy(align_json)
    segments = new_align_json["tiers"]["words"]["entries"]
    punct = "[。、,.!?！？…「」]"

    def _is_placeholder(label):
        # Exact comparison; the old pattern `^<unk>|<sil>$` parsed as
        # "starts with <unk>" OR "ends with <sil>".
        return label in ("<unk>", "<sil>")

    checked_len = 0       # offset in full_text just after the previous entry
    prev_checked_len = 0  # offset in full_text where the previous entry starts
    i = 0
    while i < len(segments):
        label = segments[i][2]
        if _is_placeholder(label):
            # Merge the following placeholders into this entry. The bounds
            # check stops the loop when the run reaches the end of the list.
            while i + 1 < len(segments) and _is_placeholder(segments[i + 1][2]):
                segments[i][1] = segments[i + 1][1]
                segments.pop(i + 1)

            has_prev = i > 0
            has_next = i + 1 < len(segments)
            # Neighbour labels are literal text, so they must be escaped:
            # an attached "?" or "." would otherwise act as a regex operator.
            prefix = re.escape(segments[i - 1][2]) if has_prev else ""
            suffix = re.escape(segments[i + 1][2]) if has_next else "$"
            # With a previous entry, search from where that entry begins so
            # `prefix` can match. This also applies to a trailing placeholder,
            # which was previously searched from the wrong offset.
            search_from = prev_checked_len if has_prev else checked_len
            m = re.search(f"^{prefix}(.+?){suffix}", full_text[search_from:])
            if m is None:
                raise ValueError(
                    f"could not locate the text for placeholder entry {i} in the transcript"
                )
            label = m.group(1)
        else:
            rest = full_text[checked_len:]
            word = re.escape(label)
            leading = re.match(f"({punct}){word}", rest)
            if leading:
                label = leading.group(1) + label
            else:
                trailing = re.match(f"{word}({punct})", rest)
                if trailing:
                    label = label + trailing.group(1)

        segments[i][2] = label
        prev_checked_len = checked_len
        checked_len += len(label)
        i += 1
    return new_align_json

In [61]:
from os.path import join, expanduser
import subprocess
import json
import re
import shutil

def alignment_channel(channel, target_dir_name):
    """Run ``mfa align`` on one corpus sub-directory.

    The corpus is read from ``<mfa_input_dir>/<target_dir_name>`` and the
    alignment is written as JSON to ``<mfa_output_dir>/<target_dir_name>``.
    Both directories are created if missing.

    Args:
        channel: Unused; kept so existing callers keep working.
        target_dir_name: Name of the sub-directory under the MFA input and
            output roots.

    Raises:
        subprocess.CalledProcessError: If the ``mfa`` command exits with a
            non-zero status.
    """
    input_dir_path = join(mfa_input_dir, target_dir_name)
    output_dir_path = join(mfa_output_dir, target_dir_name)
    os.makedirs(input_dir_path, exist_ok=True)
    os.makedirs(output_dir_path, exist_ok=True)

    # check=True makes an MFA failure surface here, not later as a missing
    # or stale output JSON.
    subprocess.run([
        "mfa",
        "align",
        input_dir_path,
        "japanese_mfa",
        model_dir,
        output_dir_path,
        "--quiet",
        "--overwrite",
        "--clean",
        "--final_clean",
        "--output_format", "json",
        "--beam", "1000",
        "--retry_beam", "4000",
    ], check=True)

def parse_ft_json(json_data):
    """Convert an MFA word alignment into the fine-tuning alignment format.

    Args:
        json_data: Parsed alignment JSON whose ``tiers.words.entries`` items
            are ``[start, end, label]``.

    Returns:
        dict: ``{"alignments": [[label, [start, end], "SPEAKER_MAIN"], ...]}``
        ordered by start time (ties keep their input order).
    """
    word_entries = json_data["tiers"]["words"]["entries"]
    rows = [
        [entry[2], [entry[0], entry[1]], "SPEAKER_MAIN"]
        for entry in word_entries
    ]
    # sorted() is stable, so equal start times keep their original order.
    return {"alignments": sorted(rows, key=lambda row: row[1][0])}

def alignment_audio_dialogue(text_dialogue_list, audio_path, idx):
    """Align one generated dialogue with MFA and return fine-tuning alignments.

    Steps:
      1. Write the transcript (one utterance per line) and copy the WAV into
         ``<mfa_input_dir>/<idx>/`` as ``<idx>.txt`` / ``<idx>.wav``.
      2. Run MFA through ``alignment_channel``.
      3. Load ``<mfa_output_dir>/<idx>/<idx>.json``, try to restore unknown
         words and punctuation with ``correct_json``, and convert the result
         with ``parse_ft_json``. If the correction fails, the uncorrected
         alignment is converted instead.

    Args:
        text_dialogue_list: Iterable of turns exposing ``text``.
        audio_path: Path of the dialogue WAV file to align.
        idx: Index used to name the MFA sub-directory and files.

    Returns:
        dict: The structure produced by ``parse_ft_json``.
    """
    target_dir_name = str(idx)
    target_dir = os.path.join(mfa_input_dir, target_dir_name)
    os.makedirs(target_dir, exist_ok=True)

    utterances = [dial.text for dial in text_dialogue_list]
    # Transcript without line breaks; used to match against the word alignment.
    oneline_text = "".join(utterances)

    # Explicit UTF-8: the transcript is Japanese and must not depend on the
    # platform's default encoding.
    target_text_file = os.path.join(target_dir, f"{idx}.txt")
    with open(target_text_file, "w", encoding="utf-8") as f:
        f.write("".join(text + "\n" for text in utterances))

    # MFA pairs audio and transcript by file name, so the copy is named <idx>.wav.
    wav_name = f"{idx}.wav"
    dist_wav_path = os.path.join(target_dir, wav_name)
    shutil.copy(audio_path, dist_wav_path)

    # alignment_channel ignores its first argument, so the audio is no longer
    # read into memory just to be passed along.
    alignment_channel(None, target_dir_name)
    json_path = os.path.join(mfa_output_dir, target_dir_name, f"{idx}.json")
    with open(json_path, "r", encoding="utf-8") as f:
        json_data = json.load(f)

    try:
        correct_json_data = correct_json(oneline_text, json_data)
        ft_json = parse_ft_json(correct_json_data)
    except Exception:
        # Best-effort fallback. `except Exception` (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        print(f"jsonファイル {idx}.json の訂正に失敗しました。")
        ft_json = parse_ft_json(json_data)

    return ft_json

## フォルダ初期化

In [62]:
import re

def get_file_name(dir_path=None):
    """Return the highest index N among ``<N>.wav`` files in a directory.

    Args:
        dir_path: Directory to scan. Defaults to ``audio_dir_path``.

    Returns:
        int: The largest numeric file stem found, or -1 when the directory
        contains no ``<N>.wav`` file.
    """
    if dir_path is None:
        dir_path = audio_dir_path

    wav_file_pattern = re.compile(r"^(\d+)\.wav$")
    num = -1
    for file in os.listdir(dir_path):
        # Match once and reuse the result. The debug prints and the redundant
        # existence check on freshly listed entries were removed.
        match_obj = wav_file_pattern.match(file)
        if match_obj is None:
            continue
        num = max(num, int(match_obj.group(1)))
    return num

In [63]:
from glob import glob
import shutil

def delete_files(dir_path):
    """Empty a directory by removing it recursively and recreating it.

    Args:
        dir_path: Directory to reset. If it does not exist it is simply
            created (missing parents included).
    """
    # Guard the removal: rmtree raises when the directory is missing, which
    # happens when the same path is listed twice or was never created.
    if os.path.isdir(dir_path):
        shutil.rmtree(dir_path)
    os.makedirs(dir_path, exist_ok=True)

# Decide the index of the last existing dialogue file.
# WARNING: with IS_REMOVE_EXIST_FILE set, every directory in base_paths is
# deleted and recreated, so all previously generated data is lost.
if IS_REMOVE_EXIST_FILE:
    # Start from scratch: -1 makes the main loop begin numbering at 0.
    file_name_num = -1
    for dir_path in base_paths:
        delete_files(dir_path)
else:
    # Keep existing data and continue after the highest existing <N>.wav.
    file_name_num = get_file_name()

## メイン処理

In [64]:
%%time

import soundfile as sf
import json

# Generate gen_dial_num dialogues, numbered consecutively after file_name_num.
for i in range(file_name_num+1, gen_dial_num+file_name_num+1):

    # Generate the dialogue text.
    txt_dialogue_list = gen_txt_dialogue()

    # Build the assist text (full transcript) used during speech synthesis.
    audio_synth_prompt = build_audio_synth_prompt(txt_dialogue_list)

    # Synthesize the dialogue text into a two-channel waveform.
    stereo = gen_audio_dialogue(txt_dialogue_list, audio_synth_prompt)
    
    wav_name = f"{i}.wav"
    audio_file_path = os.path.join(audio_dir_path, wav_name)

    # Write the WAV file at the configured sampling rate.
    sf.write(audio_file_path, stereo, setting_sr)

    # Run forced alignment and get the alignment data for fine-tuning.
    json_data = alignment_audio_dialogue(txt_dialogue_list, audio_file_path, i)

    json_name = f"{i}.json"
    json_file_path = os.path.join(json_dir_path, json_name)
    
    # Write the alignment JSON next to the audio (UTF-8, non-ASCII kept as-is).
    with open(json_file_path, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, ensure_ascii=False, indent=2)

[32m11-04 23:49:05[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、今日はどんなことでお話しに来られましたか？
[32m11-04 23:49:05[0m |[1m  INFO  [0m| infer.py:24 | Using JP-Extra model


  WeightNorm.apply(module, name, dim)


[32m11-04 23:49:06[0m |[1m  INFO  [0m| safetensors.py:50 | Loaded 'model_assets/jvnv-F2-jp/jvnv-F2_e166_s20000.safetensors' (iteration 166)
[32m11-04 23:49:06[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
[32m11-04 23:49:06[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。えっと、最近、仕事のことがずっと頭から離れなくて、なんだかいつも焦っているような、漠然とした不安があるんです。特に何が、というわけでもないんですけど…。
[32m11-04 23:49:06[0m |[1m  INFO  [0m| infer.py:24 | Using JP-Extra model
[32m11-04 23:49:07[0m |[1m  INFO  [0m| safetensors.py:50 | Loaded 'model_assets/jvnv-M2-jp/jvnv-M2-jp_e159_s17000.safetensors' (iteration 159)
[32m11-04 23:49:07[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
[32m11-04 23:49:07[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですか。お仕事のことで、常に焦りや不安を感じていらっしゃるんですね。はい。それは、いつ頃から感じ始めたことでしたか？
[32m11-04 23:49:08[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successful

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   
[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m

[32m11-04 23:53:29[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Bさん。今日はどんなことをお話ししましょうか？
[32m11-04 23:53:29[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
[32m11-04 23:53:29[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、先生。最近、色々と「こうするべきだ」って考えてしまって、すごく疲れるんです。
[32m11-04 23:53:29[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
[32m11-04 23:53:29[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
「こうするべきだ」ですか。ええ。具体的にどんな時にそう感じますか？




[32m11-04 23:53:30[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
[32m11-04 23:53:30[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
仕事で新しいプロジェクトを任された時も、「完璧にこなすべきだ」って思って、夜遅くまで資料を作ったりしてしまうんです。
[32m11-04 23:53:30[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
[32m11-04 23:53:30[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
完璧にこなすべき、と。はい。そうなんですね。そう思うと、どんな気持ちになりますか？
[32m11-04 23:53:30[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
[32m11-04 23:53:30[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
プレッシャーがすごくて、失敗したらどうしようって不安で、全然リラックスできないんです。
[32m11-04 23:53:30[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
[32m11-04 23:53:30[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
不安になる、と。なるほど。その「完璧にこなすべきだ」という考えが、Bさんにとってどんな影響を与えていると思いますか？
[32m11-

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   
[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m

CPU times: user 1min 13s, sys: 3.02 s, total: 1min 16s
Wall time: 8min 55s



KeyboardInterrupt

