# FT用データ生成スクリプト

In [1]:
# !conda install -y -c conda-forge kalpy \
# kaldi \
# pynini

# # パッケージインストール
# !pip install -r requirements.sbv.txt

In [2]:
# !pip list

In [3]:
# # mfa
# # 日本語辞書のダウンロード
# !mfa model download dictionary japanese_mfa

# # 日本語音響モデルのダウンロード
# !mfa model download acoustic japanese_mfa

## テキスト対話データ生成

In [4]:
import os
from typing import Literal
import ast

from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PDFMinerLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI


# .envファイル読み込み
load_dotenv()

True

In [5]:
#config
from os.path import join, expanduser

OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
BASE_URL = "https://api.openai.iniad.org/api/v1"
MODEL='gemini-2.5-flash'
TEMPERATURE = 1.0
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

# 生成する音声のサンプリングレート
setting_sr = 16000

#対話音声データの個数を指定
gen_dial_num = 100

# すでに作成した対話データを削除するかどうか
IS_REMOVE_EXIST_FILE = True

# ftに使うjsonとaudioの出力フォルダパス
home_dir = expanduser("~")
json_dir_path = join(home_dir, "Github/jmoshi-ft/gen_dialogue/data/sbv/transcription")
audio_dir_path = join(home_dir, "Github/jmoshi-ft/gen_dialogue/data/sbv/audio")

# mfa関連のパス
model_dir = join(home_dir, "Documents/MFA/pretrained_models/acoustic/japanese_mfa.zip")
mfa_input_dir = join(home_dir, "Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_input")
mfa_output_dir = join(home_dir, "Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output")

In [6]:
base_paths = [
    json_dir_path,
    audio_dir_path,
    mfa_input_dir,
    mfa_output_dir,
]

for p in base_paths:
    if not os.path.isdir(p):
        os.makedirs(p)

In [7]:
# model定義
model = ChatGoogleGenerativeAI(
                 model=MODEL,
                 temperature=TEMPERATURE)

# 埋め込みモデル定義
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
    openai_api_base=BASE_URL,
    model="text-embedding-3-large"
)

# データベース定義
vector_store = Chroma(
    collection_name="collection",
    embedding_function=embeddings,
    # persist_directory = "/path/to/db_file" # if necessary
)

In [8]:
loader = DirectoryLoader(
    "../../mental_docs/",
    glob="*.pdf",
    show_progress=True,
    loader_cls=PDFMinerLoader,
)
docs = loader.load()
print(f"Loaded {len(docs)} documents")

  0%|                                                                                     | 0/3 [00:00<?, ?it/s]Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set g

Loaded 3 documents





In [9]:
# Debug
# for doc in docs:
#     print("-------------------------------------------------")
#     print(doc.metadata)
#     print(len(doc.page_content))
#     print(doc.page_content[:100])

In [10]:
#読み込んだ文章データをオーバーラップ200文字で1000文字づつ分割
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True, # 分割前の文章のインデックスを追跡
)
splits = text_splitter.split_documents(docs)

# データベースにデータを追加
document_ids = vector_store.add_documents(documents=splits)

In [11]:
from langchain.agents.middleware import dynamic_prompt, ModelRequest

@dynamic_prompt
def prompt_with_context(request: ModelRequest) -> str:
    """Inject context into state messages."""
    last_query = request.state["messages"][-1].text
    retrieved_docs = vector_store.similarity_search(last_query, k=2)

    docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)

    system_message = (
        "You are a helpful assistant. Use the following context in your response:"
        f"\n\n{docs_content}"
    )

    return system_message

In [12]:
from typing import Literal

from pydantic import BaseModel, Field


class Dialogue(BaseModel):
    """対話データを構成する対話クラス"""
    speaker: Literal["A", "B"] = Field(..., description="話者。Aはカウンセラー、Bはクライエントを表す。")
    text: str = Field(..., description="話者が話した内容。")

class Dialogues(BaseModel):
    """カウンセリングを目的としたカウンセリング対話データ"""
    dialogues: list[Dialogue] = Field(..., description="対話データを構成する対話クラスのリスト。")

In [13]:
from langchain.agents import create_agent
from langchain.agents.structured_output import ToolStrategy

agent = create_agent(
    model, 
    tools=[],
    middleware=[prompt_with_context],
    response_format=ToolStrategy(
        Dialogues,
        handle_errors="フォーマットに合うように、もう一度対話データを生成してください。"
    )
)

In [14]:
#promptを作成
import random


sessions = [
    "【段階：初期】信頼関係を築きつつ、悩みの背景を深掘りするシーン",
    "【段階：中期】クライエントの「すべき思考」に焦点を当て、認知の歪みを扱うシーン",
    "【段階：終結期】これまでのセッションを振り返り、終結に向けて準備するシーン",
]

def gen_prompt_txt():
    choiced = random.randint(0, 2)
    choiced_session = sessions[choiced]
    prompt_txt = f"""メンタルヘルスケアカウンセリングのセッションをシミュレーションしてください。
シミュレーションしたい「段階」と「テーマ」:
{choiced_session}

役割定義:
A (カウンセラー): メンタルヘルスケアの専門知識を持つ経験豊富なカウンセラー。傾聴と共感の姿勢を基本とし、クライエントの言葉を促すように、優しく、自然な話し言葉（「〜ですね」「〜でしたか」など）を使います。
B (クライエント): 仕事上の悩みだけでなく、日常生活全般に対して漠然とした不安や焦りを感じている人物。

対話の要件:
スタイル: 実際の会話の文字起こしのように、堅苦しくない自然な「話し言葉」を使用してください。
相槌 (あいづち): カウンセラー（A）は、クライエント（B）の話を促し、共感を示すため、「ええ」「はい」「そうなんですね」「なるほど」といった細かな相槌を頻繁に、適切なタイミングで挿入してください。
構成: 会話が途中で途切れるのではなく、初回のヒアリングとして「一区切り」がつき、自然に終了する流れにしてください（例：次回の約束、今回のまとめなど）。
分量: 会話の往復は合計12〜20ターン程度、全体の文字数が合計500〜800文字程度になるように構成してください。
"""
    return prompt_txt

In [15]:
# テキスト対話生成関数
def gen_txt_dialogue():
    prompt = gen_prompt_txt()
    resp = agent.invoke({"messages": [{"role": "user", "content": prompt}]})
    dialogues_list = resp["structured_response"].dialogues
    return dialogues_list

In [16]:
#DEBUG
# txt_dialogue = gen_txt_dialogue()
# print(txt_dialogue)
# lst_dialogue = txt_to_lst(txt_dialogue)
# print(lst_dialogue)

## テキスト対話データを音声対話データに変換 

In [17]:
from style_bert_vits2.nlp import bert_models
from style_bert_vits2.constants import Languages
from pathlib import Path
from huggingface_hub import hf_hub_download
from style_bert_vits2.tts_model import TTSModel

bert_models.load_model(Languages.JP, "ku-nlp/deberta-v2-large-japanese-char-wwm")
bert_models.load_tokenizer(Languages.JP, "ku-nlp/deberta-v2-large-japanese-char-wwm")
assets_root = Path("model_assets")

# # 子春音あみ
# model_file = "koharune-ami/koharune-ami.safetensors"
# config_file = "koharune-ami/config.json"
# style_file = "koharune-ami/style_vectors.npy"
# hf_repo = "litagin/sbv2_koharune_ami"

# # あみたろ
# model_file = "amitaro/amitaro.safetensors"
# config_file = "amitaro/config.json"
# style_file = "amitaro/style_vectors.npy"
# hf_repo = "litagin/sbv2_amitaro"


# デフォルトの女性2
model_file = "jvnv-F2-jp/jvnv-F2_e166_s20000.safetensors"
config_file = "jvnv-F2-jp/config.json"
style_file = "jvnv-F2-jp/style_vectors.npy"
hf_repo = "litagin/style_bert_vits2_jvnv"

for file in [model_file, config_file, style_file]:
    print(file)
    hf_hub_download(hf_repo, file, local_dir="model_assets")

A_model = TTSModel(
    model_path=assets_root / model_file,
    config_path=assets_root / config_file,
    style_vec_path=assets_root / style_file,
    device="cuda",
)

# デフォルトの男性2
model_file = "jvnv-M2-jp/jvnv-M2-jp_e159_s17000.safetensors"
config_file = "jvnv-M2-jp/config.json"
style_file = "jvnv-M2-jp/style_vectors.npy"

for file in [model_file, config_file, style_file]:
    print(file)
    hf_hub_download(hf_repo, file, local_dir="model_assets")

B_model = TTSModel(
    model_path=assets_root / model_file,
    config_path=assets_root / config_file,
    style_vec_path=assets_root / style_file,
    device="cuda",
)

[32m10-25 18:00:24[0m |[1m  INFO  [0m| bert_models.py:92 | Loaded the Languages.JP BERT model from ku-nlp/deberta-v2-large-japanese-char-wwm
[32m10-25 18:00:25[0m |[1m  INFO  [0m| bert_models.py:154 | Loaded the Languages.JP BERT tokenizer from ku-nlp/deberta-v2-large-japanese-char-wwm
jvnv-F2-jp/jvnv-F2_e166_s20000.safetensors
jvnv-F2-jp/config.json
jvnv-F2-jp/style_vectors.npy
jvnv-M2-jp/jvnv-M2-jp_e159_s17000.safetensors
jvnv-M2-jp/config.json
jvnv-M2-jp/style_vectors.npy


In [18]:
from typing import Literal

def sbv_tts(text: str, speaker: Literal["A", "B"], assist_text=None):
    if speaker == "A":
        sr, audio = A_model.infer(
            text = text,
            style='Happy',
            style_weight=1,
            split_interval = 0.3,
            use_assist_text = True if assist_text is not None else None,
            assist_text = assist_text
        )
    else:
        sr, audio = B_model.infer(
            text = text,
            style='Sad',
            style_weight=1,
            split_interval = 0.3,
            use_assist_text = True if assist_text is not None else None,
            assist_text = assist_text
        )
    
    return sr, audio

In [19]:
import librosa
import numpy as np

def gen_audio_dialogue(text_dialogue_list):
    # 音声ファイルを順番に生成（ファイルは不要なのでwave配列で持つ）
    wav_data = []
    for dial in text_dialogue_list:
        speaker = dial.speaker
        sr, wav = sbv_tts(dial.text, speaker)
        print(wav.shape)

        # サンプリングレートを変換
        if sr != setting_sr:
            # 16ビット整数のデータを、-1.0から1.0の範囲に収まる浮動小数点数に正規化
            wav = wav.astype(np.float32) / 32768.0
            wav = librosa.resample(wav, orig_sr=sr, target_sr=setting_sr)

        # 0.3秒間の無音時間を追加
        duration_sec = 0.3
        num_silent_samples = int(setting_sr*duration_sec)
        silence = np.zeros(num_silent_samples, dtype=wav.dtype)
        wav_with_silence = np.concatenate((wav, silence))
        wav_data.append(wav_with_silence)
    
    # 最終的な音声長を決定
    max_len = sum([len(w) for w in wav_data])
    
    # ステレオ音声用（2チャンネル×最大長）の空配列をゼロ初期化で作成
    stereo = np.zeros((2, max_len), dtype=np.float32)
    
    pos = 0
    for i, wav in enumerate(wav_data):
        ch = i%2  # 0:左(A), 1:右(B)
        stereo[ch, pos:pos+len(wav)] += wav
        pos += len(wav)
    
    # 転置(-1,2)する
    stereo = stereo.T
    return stereo

## mfa(montreal force alignment)による音声アラインメント

In [20]:
import MeCab
import re

# 句読点のパターン
PUNCT_RE = re.compile(r'^[。、,.!?！？…]+$')

def tokenize_text(text, is_punct_isolated=False):
    tokens = []
    punct_dict = {}
    checked_punct_pos = 0
    try:
        # MeCabのタガーを初期化
        tagger = MeCab.Tagger()

        # MeCabは内部でShift-JISやEUC-JPを期待することがあるため、
        # UnicodeDecodeErrorを避けるために明示的にUTF-8でエンコード・デコードする
        # parseToNodeは、より詳細な情報をノードオブジェクトとして取得できるメソッド
        node = tagger.parseToNode(text)
        while node:
            if not node.surface:
                pass
                
            elif not is_punct_isolated and PUNCT_RE.match(node.surface) and tokens:
                punct_dict[checked_punct_pos] = node.surface
                
                checked_punct_pos += len(node.surface)
                # 句読点なら直前トークンに連結
                tokens[-1] += node.surface
            else:
                checked_punct_pos += len(node.surface)
                
                # 通常トークンはそのまま追加
                tokens.append(node.surface)
            node = node.next
    except RuntimeError as e:
        print(f"MeCabの実行中にエラーが発生しました: {e}", file=sys.stderr)
        
    return tokens, punct_dict

In [21]:
def generate_txt_file_using_mecab(input_txt, path):
    tokens, punct_dict = tokenize_text(input_txt)
    output = ""
    for token in tokens:
        output += token + "\n"
        
    with open(path, "w", encoding="utf-8") as f:
        f.write(output)
    return tokens, punct_dict

In [22]:
from os.path import join, expanduser
import subprocess
import json

def alignment_channel(channel, txt, target_dir_name):
    input_dir_path = join(mfa_input_dir, target_dir_name)
    output_dir_path = join(mfa_output_dir, target_dir_name)
    os.makedirs(input_dir_path, exist_ok=True)
    os.makedirs(output_dir_path, exist_ok=True)
    
    for_align_audio_path = join(input_dir_path, f"{target_dir_name}.wav")
    for_align_txt_path = join(input_dir_path, f"{target_dir_name}.txt")

    sf.write(for_align_audio_path, channel, setting_sr)
    _, punct_dict = generate_txt_file_using_mecab(txt, for_align_txt_path)
    subprocess.run([
        "mfa",
        "align",
        input_dir_path,
        "japanese_mfa",
        model_dir,
        output_dir_path,
        "--verbose",
        "--override",
        "--clean",
        "--output_format", "json",
        "--use_mp",
        "--beam", "1000",
        "--retry_beam", "4000",
        "--punctuation", "…",
    ])
    return punct_dict

def json_formatter_for_ft(align_json_A, align_json_B):
    json = []

    segments_A = align_json_A["tiers"]["words"]["entries"]
    segments_B = align_json_B["tiers"]["words"]["entries"]
    for segment in segments_A:
        json.append({
            "speaker": "A",
            "word": segment[2],
            "start": segment[0],
            "end": segment[1],
        })
    for segment in segments_B:
        json.append({
            "speaker": "B",
            "word": segment[2],
            "start": segment[0],
            "end": segment[1],
        })
    sorted_json = sorted(json, key=lambda seg: seg["start"])
    return sorted_json

def lst_to_line_str(lst):
    result = ""
    for s in lst:
        result += s
    return result
    
def alignment_audio_dialogue(text_dialogue_list, audio_path, idx):
    # ステレオ分離: speaker A=左(0), B=右(1)と仮定
    audio, sr = sf.read(audio_path)    # (samples, channels)
    channel_A = audio[:,0]
    channel_B = audio[:,1]
    txt_lst_A = []
    txt_lst_B = []
    for txt_dial in text_dialogue_list:
        if txt_dial.speaker == "A":
            txt_lst_A.append(txt_dial.text)
        else:
            txt_lst_B.append(txt_dial.text)
    A_full_txt = lst_to_line_str(txt_lst_A)
    B_full_txt = lst_to_line_str(txt_lst_B)
    
    target_dir_name_A = f"A_{idx}"
    target_dir_name_B = f"B_{idx}"
    punct_dict_A = alignment_channel(channel_A, A_full_txt, target_dir_name_A)
    punct_dict_B = alignment_channel(channel_B, B_full_txt, target_dir_name_B)
    json_path_A = join(mfa_output_dir, target_dir_name_A, f"{target_dir_name_A}.json")
    json_path_B = join(mfa_output_dir, target_dir_name_B, f"{target_dir_name_B}.json")
    with open(json_path_A, "r") as f:
        json_A = json.load(f)
    with open(json_path_B, "r") as f:
        json_B = json.load(f)
    
    ft_json = json_formatter_for_ft(json_A, json_B)

    return ft_json

## フォルダ初期化

In [23]:
import re

def get_file_name():
    wav_file_pattern = r"^(\d+)\.wav$"
    num = -1
    for file in os.listdir(audio_dir_path):
        if not os.path.exists(os.path.join(audio_dir_path, file)):
            continue
        if not re.match(wav_file_pattern, file):
            continue
    
        match_obj = re.match(wav_file_pattern, file)
        get_number = int(match_obj.groups()[0])
    
        if num < get_number:
            num = get_number
    return num

In [24]:
from glob import glob
import shutil

def delete_files(dir_path):
    shutil.rmtree(dir_path)
    os.makedirs(dir_path)

if IS_REMOVE_EXIST_FILE:
    file_name_num = -1
    for dir_path in base_paths:
        delete_files(dir_path)
else:
    file_name_num = get_file_name()

## メイン処理

In [25]:
%%time

import soundfile as sf
import json

for i in range(file_name_num+1, gen_dial_num+file_name_num+1):

    txt_dialogue_list = gen_txt_dialogue()
    stereo = gen_audio_dialogue(txt_dialogue_list)
    
    wav_name = f"{i}.wav"
    audio_file_path = os.path.join(audio_dir_path, wav_name)

    # wavファイル出力
    sf.write(audio_file_path, stereo, setting_sr)

    json_data = alignment_audio_dialogue(txt_dialogue_list, audio_file_path, i)

    json_name = f"{i}.json"
    json_file_path = os.path.join(json_dir_path, json_name)
    
    # JSON出力
    with open(json_file_path, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, ensure_ascii=False, indent=2)

[32m10-25 18:00:43[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。Bさん、今日はどんな感じですか？前回の後、何か気づいたことなどありましたか？
[32m10-25 18:00:43[0m |[1m  INFO  [0m| infer.py:24 | Using JP-Extra model


  WeightNorm.apply(module, name, dim)


[32m10-25 18:00:44[0m |[1m  INFO  [0m| safetensors.py:50 | Loaded 'model_assets/jvnv-F2-jp/jvnv-F2_e166_s20000.safetensors' (iteration 166)
Downloading: "https://github.com/r9y9/open_jtalk/releases/download/v1.11.1/open_jtalk_dic_utf_8-1.11.tar.gz"


  import pkg_resources


dic.tar.gz: 0.00B [00:00, ?B/s]

Extracting tar file /home1/s1f102201582/anaconda3/envs/sbv-tts/lib/python3.12/site-packages/pyopenjtalk/dic.tar.gz
[32m10-25 18:00:46[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(318976,)
[32m10-25 18:00:54[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。そうですね…最近、やることがたくさんあるのに、全然うまくいかなくて。もっとちゃんと「すべき」なのに、と思ってしまうんです。
[32m10-25 18:00:54[0m |[1m  INFO  [0m| infer.py:24 | Using JP-Extra model


  WeightNorm.apply(module, name, dim)


[32m10-25 18:00:54[0m |[1m  INFO  [0m| safetensors.py:50 | Loaded 'model_assets/jvnv-M2-jp/jvnv-M2-jp_e159_s17000.safetensors' (iteration 159)
[32m10-25 18:00:55[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(442368,)
[32m10-25 18:00:55[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、なるほど。「もっとちゃんとすべき」と感じていらっしゃるのですね。はい。具体的には、どんな時にそう思われますか？
[32m10-25 18:00:55[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(399872,)
[32m10-25 18:00:55[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
例えば、仕事でもっと効率的に動く「べき」とか、家事も完璧にこなす「べき」とか…いつも「こうあるべきだ」って考えてしまって、疲れてしまいます。
[32m10-25 18:00:55[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(467968,)
[32m10-25 18:00:55[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。たくさんの「〜べき」に縛られて、しんどく感じていらっしゃるのですね。
[32m10-25 18:00:55[0m |[1m  INFO  [0m| tts_model.py:324



[32m10-25 18:00:55[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(75264,)
[32m10-25 18:00:55[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
その「〜べき」というのは、Bさんにとって、どんな意味を持つのでしょう？誰かに言われたことでしょうか、それとも、ご自身でそう思い込んでいることでしょうか。
[32m10-25 18:00:55[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(467456,)
[32m10-25 18:00:55[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
うーん…誰かに言われたわけじゃないんですが、なんだか「そうあるべき」だって、ずっと自分に言い聞かせているような気がします。




[32m10-25 18:00:55[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(349184,)
[32m10-25 18:00:56[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
なるほど。ご自身の中で、自然と「こうしなければ」という思いが強くなっているのかもしれませんね。そういった「〜すべき」という考え方は、時にBさんを苦しめてしまうこともあるのかもしれません。
[32m10-25 18:00:56[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(617472,)
[32m10-25 18:00:56[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
確かに、そうですね。それを考えるだけで、また気分が重くなってきてしまいます。




[32m10-25 18:00:56[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(260096,)
[32m10-25 18:00:56[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ。今日はこの「〜すべき」という考え方、私たちは「すべき思考」と呼んだりするんですが、それがBさんの心にどんな影響を与えているのか、少し見ていきたいなと思います。いかがでしょうか？
[32m10-25 18:00:56[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(611840,)
[32m10-25 18:00:56[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、お願いします。




[32m10-25 18:00:56[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(79360,)
[32m10-25 18:00:56[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ありがとうございます。では、具体的に、最近感じた「〜すべき」をいくつか書き出してみるところから始めてみましょうか。これは次回の宿題にも繋がりますしね。
[32m10-25 18:00:56[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(499200,)
[32m10-25 18:00:56[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
わかりました。やってみます。




[32m10-25 18:00:56[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(101888,)
[32m10-25 18:00:56[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ。今日は、Bさんの「すべき思考」について、少しお話しができてよかったです。次回までに、日常生活で「〜すべき」と感じたことを、ぜひメモしてみてください。また来週、お話し聞かせてくださいね。
[32m10-25 18:00:56[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(660480,)
[32m10-25 18:00:56[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、ありがとうございます。また来週お願いします。




[32m10-25 18:00:56[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(188416,)


[2;36m [0m[36mDEBUG   [0m Beginning run for A_0                                                 
[2;36m [0m[36mDEBUG   [0m Using [32m"global"[0m profile                                                
[2;36m [0m[36mDEBUG   [0m Using multiprocessing with [1;36m3[0m                                          
[2;36m [0m[36mDEBUG   [0m Set up logger for MFA version: [1;36m3.3[0m.[1;36m7[0m                                  
[2;36m [0m[36mDEBUG   [0m Cleaned previous run                                                  
[2;36m [0m[36mDEBUG   [0m There were some differences in the current run compared to the last   
[2;36m [0m         one. This may cause issues, run with --clean, if you hit an error.    
[2;36m [0m[36mDEBUG   [0m Loaded dictionary in [1;36m35.840[0m seconds                                   
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[36mDEBUG   [0m Could no

[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Processing queue: [1;36m0.04265131500000052[0m                                 
[2;36m [0m[36mDEBUG   [0m Parsed corpus directory with [1;36m3[0m jobs in [1;36m0.055298225000001366[0m seconds   
[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[36mDEBUG   [0m Loaded corpus in [1;36m1.045[0m seconds                                        
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[36mDEBUG   [0m Initialized jobs in [1;36m0.032[0m seconds                                     
[2;36m [

[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Wrote lexicon information in [1;36m23.558[0m seconds                           
[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[36mDEBUG   [0m Generating MFCCs took [1;36m2.227[0m seconds                                   
[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[36mDEBUG   [0m Generating final features took [1;36m1.111[0m seconds                          
[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Generated features in [1;36m4.443[0m seconds                                   
[2;36m [0m[36mDEBUG   [0m Setting up corpus took [1;36m94.342[0m seconds                                 
[2;36m [0m[36mDEBUG   [0m                                                                       
[2;36m [0m[36mDEBUG   [0m ====ACOUSTIC MODEL [33mINFO[0m====                                           
[2;36m [0m[36mDEBUG   [0m Acoustic model root directory:                                        
[2;36m [0m         [35m/users/s1f102201582/Documents/MFA/extracted_models/[0m[95macoustic[0m           
[2;36m [0m[36mDEBUG   [0m Acoustic model directory:                                             
[2;36m [0m         [35m/users/s1f102201582/Documents/MFA/extracted_models/acoustic/[0m[95mjapanese_m[0m
[2;36m [0m         [95mfa_acoustic[0m                                                           
[2;36m [0m[36mDEBUG   [0m Acoustic mod

[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:28[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Aligned [1;36m1[0m, errors on [1;36m0[0m, total [1;36m1[0m                                       
[2;36m [0m[36mDEBUG   [0m Alignment round took [1;36m29.434[0m seconds                                   
[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Bulk insert took [1;36m0.013[0m seconds                                        
[2;36m [0m[36mDEBUG   [0m Dropping temp tables took [1;36m0.008[0m seconds                               
[2;36m [0m[36mDEBUG   [0m Collecting alignments took [1;36m4.742[0m seconds                              
[2;36m [0m[36mDEBUG   [0m Generated alignments in [1;36m39.734[0m seconds                                
[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Analyzed alignment quality in [1;36m4.986[0m seconds                           
[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_0...[0m                                                                
[2;36m [0m[36mDEBUG   [0m Not using multiprocessing for TextGrid export                         
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_0[0m!                                                                  
[2;36m [0m[36mDEBUG   [0m Exported TextGrids in a total of [1;36m0.053[0m seconds                        
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m140.511[0

[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Processing queue: [1;36m0.04280826400000137[0m                                 
[2;36m [0m[36mDEBUG   [0m Parsed corpus directory with [1;36m3[0m jobs in [1;36m0.0570258420000016[0m seconds     
[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[36mDEBUG   [0m Loaded corpus in [1;36m1.047[0m seconds                                        
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[36mDEBUG   [0m Initialized jobs in [1;36m0.030[0m seconds                                     
[2;36m [

[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Wrote lexicon information in [1;36m23.893[0m seconds                           
[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[36mDEBUG   [0m Generating MFCCs took [1;36m2.236[0m seconds                                   
[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h[?25l

[2;36m [0m[36mDEBUG   [0m Generating final features took [1;36m1.121[0m seconds                          
[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Generated features in [1;36m4.507[0m seconds                                   
[2;36m [0m[36mDEBUG   [0m Setting up corpus took [1;36m92.958[0m seconds                                 
[2;36m [0m[36mDEBUG   [0m                                                                       
[2;36m [0m[36mDEBUG   [0m ====ACOUSTIC MODEL [33mINFO[0m====                                           
[2;36m [0m[36mDEBUG   [0m Acoustic model root directory:                                        
[2;36m [0m         [35m/users/s1f102201582/Documents/MFA/extracted_models/[0m[95macoustic[0m           
[2;36m [0m[36mDEBUG   [0m Acoustic model directory:                                             
[2;36m [0m         [35m/users/s1f102201582/Documents/MFA/extracted_models/acoustic/[0m[95mjapanese_m[0m
[2;36m [0m         [95mfa_acoustic[0m                                                           
[2;36m [0m[36mDEBUG   [0m Acoustic mod

[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:25[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Aligned [1;36m1[0m, errors on [1;36m0[0m, total [1;36m1[0m                                       
[2;36m [0m[36mDEBUG   [0m Alignment round took [1;36m26.619[0m seconds                                   
[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:03[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Bulk insert took [1;36m0.010[0m seconds                                        
[2;36m [0m[36mDEBUG   [0m Dropping temp tables took [1;36m0.008[0m seconds                               
[2;36m [0m[36mDEBUG   [0m Collecting alignments took [1;36m4.321[0m seconds                              
[2;36m [0m[36mDEBUG   [0m Generated alignments in [1;36m36.479[0m seconds                                
[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[36mDEBUG   [0m Analyzed alignment quality in [1;36m2.037[0m seconds                           
[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_0...[0m                                                                
[2;36m [0m[36mDEBUG   [0m Not using multiprocessing for TextGrid export                         
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_0[0m!                                                                  
[2;36m [0m[36mDEBUG   [0m Exported TextGrids in a total of [1;36m0.025[0m seconds                        
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m132.825[0