# FT用データ生成スクリプト

In [26]:
# !conda install -y -c conda-forge kalpy \
# kaldi \
# pynini

# # パッケージインストール
# !pip install -r requirements.sbv.txt

In [27]:
# !pip list

In [28]:
# # mfa
# # 日本語辞書のダウンロード
# !mfa model download dictionary japanese_mfa

# # 日本語音響モデルのダウンロード
# !mfa model download acoustic japanese_mfa

## テキスト対話データ生成

In [29]:
import os
from typing import Literal
import ast

from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PDFMinerLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI


# .envファイル読み込み
load_dotenv()

True

In [30]:
#config
from os.path import join, expanduser

# API key for the OpenAI-compatible endpoint, read from the environment
# (load_dotenv() is called in an earlier cell). Raises KeyError when unset.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
# Base URL handed to OpenAIEmbeddings as openai_api_base.
BASE_URL = "https://api.openai.iniad.org/api/v1"
# Chat model name handed to ChatGoogleGenerativeAI.
MODEL='gemini-2.5-flash'
# Sampling temperature for the chat model.
TEMPERATURE = 1.0
# CUDA device indices made visible to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

# Sampling rate (Hz) of the generated audio
setting_sr = 16000

# Number of spoken dialogues to generate
gen_dial_num = 100

# Whether to delete dialogue data produced by earlier runs
IS_REMOVE_EXIST_FILE = True

# Output folders for the JSON transcripts and audio used for fine-tuning
home_dir = expanduser("~")
json_dir_path = join(home_dir, "Github/jmoshi-ft/gen_dialogue/data/sbv/transcription")
audio_dir_path = join(home_dir, "Github/jmoshi-ft/gen_dialogue/data/sbv/audio")

# MFA-related paths: acoustic model archive plus the aligner's input/output folders
model_dir = join(home_dir, "Documents/MFA/pretrained_models/acoustic/japanese_mfa.zip")
mfa_input_dir = join(home_dir, "Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_input")
mfa_output_dir = join(home_dir, "Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output")

In [31]:
# Every directory this notebook writes into.
base_paths = [
    json_dir_path,
    audio_dir_path,
    mfa_input_dir,
    mfa_output_dir,
]

# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of testing os.path.isdir() first.
for p in base_paths:
    os.makedirs(p, exist_ok=True)

In [32]:
# Chat model used to generate the dialogue text
model = ChatGoogleGenerativeAI(
                 model=MODEL,
                 temperature=TEMPERATURE)

# Embedding model, reached through the OpenAI-compatible endpoint at BASE_URL
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
    openai_api_base=BASE_URL,
    model="text-embedding-3-large"
)

# Vector store that receives the embedded document chunks.
# NOTE(review): no persist_directory is passed, so the collection presumably
# lives only for this kernel session — confirm against the Chroma docs.
vector_store = Chroma(
    collection_name="collection",
    embedding_function=embeddings,
    # persist_directory = "/path/to/db_file" # if necessary
)

In [33]:
# Load the files matching "*.pdf" in ../../mental_docs/ (relative to the
# notebook's working directory), parsing each one with PDFMinerLoader.
loader = DirectoryLoader(
    "../../mental_docs/",
    glob="*.pdf",
    show_progress=True,
    loader_cls=PDFMinerLoader,
)
docs = loader.load()
print(f"Loaded {len(docs)} documents")

  0%|                                                                                     | 0/3 [00:00<?, ?it/s]Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set g

Loaded 3 documents





In [34]:
# Debug
# for doc in docs:
#     print("-------------------------------------------------")
#     print(doc.metadata)
#     print(len(doc.page_content))
#     print(doc.page_content[:100])

In [35]:
# Split the loaded documents into 1000-character chunks with a 200-character overlap
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True, # keep track of each chunk's offset in the source document
)
splits = text_splitter.split_documents(docs)

# Embed the chunks and add them to the vector store.
# NOTE: re-running this cell adds the same chunks again to the same collection.
document_ids = vector_store.add_documents(documents=splits)

In [36]:
from langchain.agents.middleware import dynamic_prompt, ModelRequest

@dynamic_prompt
def prompt_with_context(request: ModelRequest) -> str:
    """Build the system prompt from documents retrieved for the latest message.

    The text of the last message in ``request.state["messages"]`` is used as
    the similarity-search query against ``vector_store``; the two best hits
    are appended to a fixed instruction and returned as the system prompt.
    """
    messages = request.state["messages"]
    hits = vector_store.similarity_search(messages[-1].text, k=2)

    # Retrieved chunks are separated by a blank line.
    docs_content = "\n\n".join([hit.page_content for hit in hits])

    return (
        "You are a helpful assistant. Use the following context in your response:"
        f"\n\n{docs_content}"
    )

In [37]:
from typing import Literal

from pydantic import BaseModel, Field


class Dialogue(BaseModel):
    """対話データを構成する対話クラス"""
    # One utterance of a dialogue.
    # NOTE(review): the docstring and Field descriptions are deliberately left in
    # Japanese — pydantic presumably exposes them in the schema handed to the
    # model via ToolStrategy, which would make them runtime text. Confirm before
    # translating.
    # speaker: "A" is the counsellor, "B" is the client (per the description).
    speaker: Literal["A", "B"] = Field(..., description="話者。Aはカウンセラー、Bはクライエントを表す。")
    # text: what the speaker said.
    text: str = Field(..., description="話者が話した内容。")

class Dialogues(BaseModel):
    """カウンセリングを目的としたカウンセリング対話データ"""
    # A whole counselling dialogue: the ordered list of Dialogue utterances.
    # NOTE(review): docstring/description kept in Japanese because they are
    # presumably part of the schema shown to the model — confirm.
    dialogues: list[Dialogue] = Field(..., description="対話データを構成する対話クラスのリスト。")

In [38]:
from langchain.agents import create_agent
from langchain.agents.structured_output import ToolStrategy

# Tool-less agent: the prompt_with_context middleware supplies the system
# prompt, and the reply is requested in the Dialogues schema via ToolStrategy.
# handle_errors carries the Japanese message used when the structured output
# fails — presumably sent back to the model as a retry instruction (confirm).
agent = create_agent(
    model, 
    tools=[],
    middleware=[prompt_with_context],
    response_format=ToolStrategy(
        Dialogues,
        handle_errors="フォーマットに合うように、もう一度対話データを生成してください。"
    )
)

In [39]:
#promptを作成
import random


# Session stage/theme descriptions; one is picked at random for each prompt.
sessions = [
    "【段階：初期】信頼関係を築きつつ、悩みの背景を深掘りするシーン",
    "【段階：中期】クライエントの「すべき思考」に焦点を当て、認知の歪みを扱うシーン",
    "【段階：終結期】これまでのセッションを振り返り、終結に向けて準備するシーン",
]

def gen_prompt_txt():
    """Build the user prompt for one simulated counselling session.

    A session description is drawn uniformly at random from ``sessions`` and
    embedded in a fixed Japanese instruction template.

    Returns:
        str: the complete prompt text.
    """
    # random.choice follows the list length, so adding or removing a session
    # no longer requires editing a hard-coded index range.
    choiced_session = random.choice(sessions)
    # NOTE(review): the "構成" line below always says "初回のヒアリング" (first
    # hearing) even when a middle/final-stage session is chosen — confirm
    # whether that wording is intended.
    prompt_txt = f"""メンタルヘルスケアカウンセリングのセッションをシミュレーションしてください。
シミュレーションしたい「段階」と「テーマ」:
{choiced_session}

役割定義:
A (カウンセラー): メンタルヘルスケアの専門知識を持つ経験豊富なカウンセラー。傾聴と共感の姿勢を基本とし、クライエントの言葉を促すように、優しく、自然な話し言葉（「〜ですね」「〜でしたか」など）を使います。
B (クライエント): 仕事上の悩みだけでなく、日常生活全般に対して漠然とした不安や焦りを感じている人物。

対話の要件:
スタイル: 実際の会話の文字起こしのように、堅苦しくない自然な「話し言葉」を使用してください。
相槌 (あいづち): カウンセラー（A）は、クライエント（B）の話を促し、共感を示すため、「ええ」「はい」「そうなんですね」「なるほど」といった細かな相槌を頻繁に、適切なタイミングで挿入してください。
構成: 会話が途中で途切れるのではなく、初回のヒアリングとして「一区切り」がつき、自然に終了する流れにしてください（例：次回の約束、今回のまとめなど）。
分量: 会話の往復は合計12〜20ターン程度、全体の文字数が合計500〜800文字程度になるように構成してください。
"""
    return prompt_txt

In [40]:
# テキスト対話生成関数
def gen_txt_dialogue():
    """Ask the agent for one dialogue and return its utterances.

    Returns:
        The ``dialogues`` attribute of the agent's ``structured_response``.
    """
    user_message = {"role": "user", "content": gen_prompt_txt()}
    response = agent.invoke({"messages": [user_message]})
    return response["structured_response"].dialogues

In [41]:
#DEBUG
# txt_dialogue = gen_txt_dialogue()
# print(txt_dialogue)
# lst_dialogue = txt_to_lst(txt_dialogue)
# print(lst_dialogue)

## テキスト対話データを音声対話データに変換 

In [42]:
from style_bert_vits2.nlp import bert_models
from style_bert_vits2.constants import Languages
from pathlib import Path
from huggingface_hub import hf_hub_download
from style_bert_vits2.tts_model import TTSModel

# Load the Japanese BERT model and tokenizer used by Style-Bert-VITS2.
bert_models.load_model(Languages.JP, "ku-nlp/deberta-v2-large-japanese-char-wwm")
bert_models.load_tokenizer(Languages.JP, "ku-nlp/deberta-v2-large-japanese-char-wwm")
# Local folder the voice-model files are downloaded into and loaded from.
assets_root = Path("model_assets")

# # Koharune Ami (alternative voice, disabled)
# model_file = "koharune-ami/koharune-ami.safetensors"
# config_file = "koharune-ami/config.json"
# style_file = "koharune-ami/style_vectors.npy"
# hf_repo = "litagin/sbv2_koharune_ami"

# # Amitaro (alternative voice, disabled)
# model_file = "amitaro/amitaro.safetensors"
# config_file = "amitaro/config.json"
# style_file = "amitaro/style_vectors.npy"
# hf_repo = "litagin/sbv2_amitaro"


# Default female voice 2 — becomes speaker A
model_file = "jvnv-F2-jp/jvnv-F2_e166_s20000.safetensors"
config_file = "jvnv-F2-jp/config.json"
style_file = "jvnv-F2-jp/style_vectors.npy"
hf_repo = "litagin/style_bert_vits2_jvnv"

# Fetch the three model files from the Hugging Face repo into model_assets/.
for file in [model_file, config_file, style_file]:
    print(file)
    hf_hub_download(hf_repo, file, local_dir="model_assets")

A_model = TTSModel(
    model_path=assets_root / model_file,
    config_path=assets_root / config_file,
    style_vec_path=assets_root / style_file,
    device="cuda",
)

# Default male voice 2 — becomes speaker B.
# model_file / config_file / style_file are reassigned here; hf_repo is reused.
model_file = "jvnv-M2-jp/jvnv-M2-jp_e159_s17000.safetensors"
config_file = "jvnv-M2-jp/config.json"
style_file = "jvnv-M2-jp/style_vectors.npy"

for file in [model_file, config_file, style_file]:
    print(file)
    hf_hub_download(hf_repo, file, local_dir="model_assets")

B_model = TTSModel(
    model_path=assets_root / model_file,
    config_path=assets_root / config_file,
    style_vec_path=assets_root / style_file,
    device="cuda",
)

jvnv-F2-jp/jvnv-F2_e166_s20000.safetensors
jvnv-F2-jp/config.json
jvnv-F2-jp/style_vectors.npy
jvnv-M2-jp/jvnv-M2-jp_e159_s17000.safetensors
jvnv-M2-jp/config.json
jvnv-M2-jp/style_vectors.npy


In [43]:
from typing import Literal

def sbv_tts(text: str, speaker: Literal["A", "B"], assist_text=None):
    """Synthesize one utterance with the voice assigned to ``speaker``.

    Speaker "A" is rendered by ``A_model`` with the 'Happy' style; any other
    value is rendered by ``B_model`` with the 'Sad' style.

    Args:
        text: Text to synthesize.
        speaker: "A" or "B".
        assist_text: Optional assist text forwarded to the model. Assist-text
            mode is enabled only when this is not None.

    Returns:
        tuple: ``(sr, audio)`` as returned by ``TTSModel.infer``.
    """
    if speaker == "A":
        tts_model, style = A_model, 'Happy'
    else:
        tts_model, style = B_model, 'Sad'

    sr, audio = tts_model.infer(
        text=text,
        style=style,
        style_weight=1,
        split_interval=0.3,
        # Pass a real bool; the original passed None when no assist text was given.
        use_assist_text=assist_text is not None,
        assist_text=assist_text,
    )
    return sr, audio

In [44]:
import librosa
import numpy as np

def gen_audio_dialogue(text_dialogue_list):
    """Render a text dialogue as one stereo waveform, one speaker per channel.

    Each utterance is synthesized with ``sbv_tts``, converted to float32,
    resampled to ``setting_sr`` when needed and followed by 0.3 s of silence.
    Utterances are laid out back to back in time; speaker "A" goes to
    channel 0 (left) and every other speaker to channel 1 (right), so the
    other channel is silent while one speaker talks.

    Args:
        text_dialogue_list: Iterable of items exposing ``.speaker`` and ``.text``.

    Returns:
        numpy.ndarray: float32 array of shape ``(total_samples, 2)``.
    """
    pause_sec = 0.3  # silence appended after every utterance
    pause = np.zeros(int(setting_sr * pause_sec), dtype=np.float32)

    segments = []  # (channel index, samples) in playback order
    for dial in text_dialogue_list:
        sr, wav = sbv_tts(dial.text, dial.speaker)

        # Always bring integer PCM into [-1.0, 1.0). The original did this only
        # on the resampling path, leaving raw int16 amplitudes in the float
        # output whenever the model rate already equalled setting_sr.
        # NOTE(review): assumes integer output is 16-bit PCM, as the original did.
        if np.issubdtype(wav.dtype, np.integer):
            wav = wav.astype(np.float32) / 32768.0
        else:
            wav = wav.astype(np.float32)

        if sr != setting_sr:
            wav = librosa.resample(wav, orig_sr=sr, target_sr=setting_sr)

        # Choose the channel from the speaker label, not from the turn index:
        # the alignment step assumes channel 0 == A and channel 1 == B, which
        # index parity breaks when turns do not alternate strictly from A.
        channel = 0 if dial.speaker == "A" else 1
        segments.append((channel, np.concatenate((wav, pause))))

    total_len = sum(len(samples) for _, samples in segments)
    stereo = np.zeros((2, total_len), dtype=np.float32)

    pos = 0
    for channel, samples in segments:
        stereo[channel, pos:pos + len(samples)] = samples
        pos += len(samples)

    # Transpose to (samples, channels).
    return stereo.T

## MFA (Montreal Forced Aligner) による音声アラインメント

In [45]:
import MeCab
import re

# Matches a token that consists solely of punctuation marks
PUNCT_RE = re.compile(r'^[。、,.!?！？…]+$')

def tokenize_text(text, is_punct_isolated=False):
    """Split ``text`` into MeCab surface tokens.

    Unless ``is_punct_isolated`` is true, a punctuation-only token is glued
    onto the preceding token instead of becoming a token of its own (a
    punctuation token with no predecessor is kept as a normal token).

    Args:
        text: Text to tokenize.
        is_punct_isolated: Keep punctuation as separate tokens when True.

    Returns:
        tuple: ``(tokens, punct_dict)``. ``punct_dict`` maps the offset at which
        a merged punctuation run starts — counted in characters over the
        surfaces seen so far — to that run. If MeCab raises RuntimeError the
        error is reported on stderr and whatever was collected is returned.
    """
    # Local import: the error handler below writes to sys.stderr, but sys was
    # not imported anywhere, so the handler itself raised NameError.
    import sys

    tokens = []
    punct_dict = {}
    checked_punct_pos = 0
    try:
        tagger = MeCab.Tagger()

        # parseToNode yields a linked list of nodes; nodes with an empty
        # surface are skipped.
        node = tagger.parseToNode(text)
        while node:
            surface = node.surface
            if surface:
                if not is_punct_isolated and PUNCT_RE.match(surface) and tokens:
                    # Record where the punctuation starts, then attach it to
                    # the previous token.
                    punct_dict[checked_punct_pos] = surface
                    tokens[-1] += surface
                else:
                    tokens.append(surface)
                checked_punct_pos += len(surface)
            node = node.next
    except RuntimeError as e:
        print(f"MeCabの実行中にエラーが発生しました: {e}", file=sys.stderr)

    return tokens, punct_dict

In [46]:
def generate_txt_file_using_mecab(input_txt, path):
    """Tokenize ``input_txt`` and write the tokens to ``path``, one per line.

    Args:
        input_txt: Text to tokenize with ``tokenize_text``.
        path: Destination file, written as UTF-8.

    Returns:
        tuple: the ``(tokens, punct_dict)`` pair from ``tokenize_text``.
    """
    tokens, punct_dict = tokenize_text(input_txt)
    # Each token is terminated by a newline, including the last one.
    body = "".join(f"{token}\n" for token in tokens)

    with open(path, "w", encoding="utf-8") as out_file:
        out_file.write(body)
    return tokens, punct_dict

In [47]:
from os.path import join, expanduser
import subprocess
import json

def alignment_channel(channel, txt, target_dir_name):
    """Run MFA forced alignment for one speaker's channel.

    Writes ``<target_dir_name>.wav`` and ``<target_dir_name>.txt`` under
    ``mfa_input_dir/<target_dir_name>`` and invokes ``mfa align`` with output
    directed to ``mfa_output_dir/<target_dir_name>``.

    Args:
        channel: Mono sample array for this speaker, written at ``setting_sr``.
        txt: The speaker's full transcript.
        target_dir_name: Name used for the per-speaker folders and files.

    Returns:
        dict: the ``punct_dict`` produced while tokenizing ``txt``.

    Raises:
        subprocess.CalledProcessError: if ``mfa align`` exits with a non-zero status.
    """
    input_dir_path = join(mfa_input_dir, target_dir_name)
    output_dir_path = join(mfa_output_dir, target_dir_name)
    os.makedirs(input_dir_path, exist_ok=True)
    os.makedirs(output_dir_path, exist_ok=True)

    for_align_audio_path = join(input_dir_path, f"{target_dir_name}.wav")
    for_align_txt_path = join(input_dir_path, f"{target_dir_name}.txt")

    # NOTE: `sf` (soundfile) is imported in the main-processing cell, so that
    # cell must have run before this function is called.
    sf.write(for_align_audio_path, channel, setting_sr)
    _, punct_dict = generate_txt_file_using_mecab(txt, for_align_txt_path)
    # check=True surfaces an aligner failure here instead of letting the caller
    # fail later on a missing (or stale) output file.
    # NOTE(review): the bare "--" makes every following option positional for
    # the CLI parser — confirm MFA still honours them in that form.
    subprocess.run([
        "mfa",
        "align",
        input_dir_path,
        "japanese_mfa",
        model_dir,
        output_dir_path,
        "--",
        "--overwrite",
        "--clean",
        "--final_clean",  # was "----final_clean" (typo: four dashes)
        "--output_format", "json",
        "--beam", "1000",
        "--retry_beam", "4000",
        "--punctuation", "…",
    ], check=True)
    return punct_dict

def json_formatter_for_ft(align_json_A, align_json_B):
    """Merge two per-speaker alignments into one list ordered by start time.

    Args:
        align_json_A: Alignment for speaker A; its ``["tiers"]["words"]["entries"]``
            is a sequence of ``[start, end, word]`` items.
        align_json_B: Same structure for speaker B.

    Returns:
        list[dict]: records with keys ``speaker``, ``word``, ``start`` and
        ``end``, sorted by ``start``. On equal start times A's entries come
        before B's and the original order is otherwise kept.
    """
    def to_records(speaker, align_json):
        # Each entry is indexed as [0]=start, [1]=end, [2]=word.
        return [
            {
                "speaker": speaker,
                "word": entry[2],
                "start": entry[0],
                "end": entry[1],
            }
            for entry in align_json["tiers"]["words"]["entries"]
        ]

    records = to_records("A", align_json_A) + to_records("B", align_json_B)
    records.sort(key=lambda record: record["start"])
    return records

def lst_to_line_str(lst):
    """Concatenate the strings in ``lst`` into one string, with no separator."""
    return "".join(lst)
    
def alignment_audio_dialogue(text_dialogue_list, audio_path, idx):
    """Align a stereo dialogue recording with its transcript, per speaker.

    The audio is split into its two channels (column 0 is treated as speaker
    A, column 1 as speaker B), each speaker's utterances are concatenated into
    one transcript, and each channel is aligned with ``alignment_channel``.
    The two resulting MFA JSON files are then merged.

    Args:
        text_dialogue_list: Iterable of items exposing ``.speaker`` and ``.text``.
        audio_path: Path of the stereo audio file to read.
        idx: Dialogue index, used to name the per-speaker MFA folders
            (``A_<idx>`` / ``B_<idx>``).

    Returns:
        list[dict]: word records from ``json_formatter_for_ft``.
    """
    # Stereo split: speaker A = left (0), B = right (1) is assumed.
    # NOTE: `sf` (soundfile) is imported in the main-processing cell.
    audio, _ = sf.read(audio_path)    # (samples, channels)
    channel_A = audio[:,0]
    channel_B = audio[:,1]

    txt_lst_A = []
    txt_lst_B = []
    for txt_dial in text_dialogue_list:
        if txt_dial.speaker == "A":
            txt_lst_A.append(txt_dial.text)
        else:
            txt_lst_B.append(txt_dial.text)
    A_full_txt = lst_to_line_str(txt_lst_A)
    B_full_txt = lst_to_line_str(txt_lst_B)

    target_dir_name_A = f"A_{idx}"
    target_dir_name_B = f"B_{idx}"
    # The punctuation maps returned by alignment_channel are not used here.
    alignment_channel(channel_A, A_full_txt, target_dir_name_A)
    alignment_channel(channel_B, B_full_txt, target_dir_name_B)

    json_path_A = join(mfa_output_dir, target_dir_name_A, f"{target_dir_name_A}.json")
    json_path_B = join(mfa_output_dir, target_dir_name_B, f"{target_dir_name_B}.json")
    # Read explicitly as UTF-8 so Japanese words decode the same way regardless
    # of the platform's default encoding.
    with open(json_path_A, "r", encoding="utf-8") as f:
        json_A = json.load(f)
    with open(json_path_B, "r", encoding="utf-8") as f:
        json_B = json.load(f)

    ft_json = json_formatter_for_ft(json_A, json_B)

    return ft_json

## フォルダ初期化

In [48]:
import re

def get_file_name():
    """Return the highest index N among the ``<N>.wav`` files in ``audio_dir_path``.

    Returns:
        int: the largest numeric file stem found, or -1 when no such file exists.
    """
    wav_file_pattern = r"^(\d+)\.wav$"
    num = -1
    for file in os.listdir(audio_dir_path):
        # Match once and reuse the result (the original matched twice).
        match_obj = re.match(wav_file_pattern, file)
        if match_obj is None:
            continue
        # Only regular files count. The original used os.path.exists, which is
        # always true for an entry that os.listdir just returned.
        if not os.path.isfile(os.path.join(audio_dir_path, file)):
            continue
        num = max(num, int(match_obj.group(1)))
    return num

In [49]:
from glob import glob
import shutil

def delete_files(dir_path):
    """Leave ``dir_path`` as an existing, empty directory.

    An existing directory is removed recursively and recreated; a missing one
    is simply created (the original raised FileNotFoundError in that case).

    Args:
        dir_path: Directory to reset.
    """
    if os.path.isdir(dir_path):
        shutil.rmtree(dir_path)
    os.makedirs(dir_path, exist_ok=True)

# Either wipe every output directory and start numbering from 0, or keep the
# existing data and continue after the highest existing "<N>.wav" index.
# file_name_num holds the last used index (-1 means "none yet").
if IS_REMOVE_EXIST_FILE:
    file_name_num = -1
    for dir_path in base_paths:
        delete_files(dir_path)
else:
    file_name_num = get_file_name()

## メイン処理

In [50]:
%%time

import soundfile as sf
import json

# Generate gen_dial_num dialogues, numbering the files from file_name_num + 1.
for i in range(file_name_num+1, gen_dial_num+file_name_num+1):

    # Dialogue text from the agent, then stereo audio from the TTS voices.
    txt_dialogue_list = gen_txt_dialogue()
    stereo = gen_audio_dialogue(txt_dialogue_list)
    
    wav_name = f"{i}.wav"
    audio_file_path = os.path.join(audio_dir_path, wav_name)

    # Write the stereo waveform as a WAV file
    sf.write(audio_file_path, stereo, setting_sr)

    # Align the audio just written against the transcript (runs MFA per speaker).
    json_data = alignment_audio_dialogue(txt_dialogue_list, audio_file_path, i)

    json_name = f"{i}.json"
    json_file_path = os.path.join(json_dir_path, json_name)
    
    # Write the word-level alignment as JSON (non-ASCII characters kept as-is)
    with open(json_file_path, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, ensure_ascii=False, indent=2)

[32m10-25 19:44:40[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
今日は、お越しくださってありがとうございます。何か、今お話ししたいこと、ありますか？
[32m10-25 19:44:40[0m |[1m  INFO  [0m| infer.py:24 | Using JP-Extra model


  WeightNorm.apply(module, name, dim)


[32m10-25 19:44:41[0m |[1m  INFO  [0m| safetensors.py:50 | Loaded 'model_assets/jvnv-F2-jp/jvnv-F2_e166_s20000.safetensors' (iteration 166)
[32m10-25 19:44:41[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(275968,)
[32m10-25 19:44:41[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、ありがとうございます。最近、仕事のことや、日常生活全体に漠然とした不安や焦りを感じることが多くて…。
[32m10-25 19:44:41[0m |[1m  INFO  [0m| infer.py:24 | Using JP-Extra model
[32m10-25 19:44:42[0m |[1m  INFO  [0m| safetensors.py:50 | Loaded 'model_assets/jvnv-M2-jp/jvnv-M2-jp_e159_s17000.safetensors' (iteration 159)
[32m10-25 19:44:42[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(401408,)
[32m10-25 19:44:42[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。漠然とした不安や焦り、ですか。ええ。どんな時に強く感じられますか？
[32m10-25 19:44:42[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(324096,)
[32m10-25 19:4

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:31[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_0...[0m                                                                
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_0[0m!                                                                  
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m141.009[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:27[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_0...[0m                                                                
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_0[0m!                                                                  
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m139.006[0m seconds                                 


[32m10-25 19:50:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
Bさん、今日はこれまでのセッションを振り返り、終結に向けて準備していきましょうか。
[32m10-25 19:50:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(258560,)
[32m10-25 19:50:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、もうそんな時期なんですね。少し寂しい気持ちもあります。
[32m10-25 19:50:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(223232,)
[32m10-25 19:50:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうなんですね。最初、漠然とした不安でしたね。この数ヶ月、Bさんにとってどんな時間でしたか？
[32m10-25 19:50:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(369664,)
[32m10-25 19:50:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。本当に何から手をつけていいか分からず、悩みが漠然としていました。
[32m10-25 19:50:23[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(244224,)
[32m10-25 19

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:16[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_1...[0m                                                                
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_1[0m!                                                                  
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m123.378[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:15[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_1...[0m                                                                
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_1[0m!                                                                  
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m125.590[0m seconds                                 


[32m10-25 19:55:11[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
今日は、これまでのセッションを少し振り返りながら、今後のお話をしていきましょうか。〇〇様、いかがですか？これまでの時間で、何かご自身の中で変化したな、と感じることはありますか？
[32m10-25 19:55:11[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(589312,)
[32m10-25 19:55:11[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい...そうですね。最初は漠然と「性格を直したい」って思ってたんですけど、今はもう少し具体的に、どういう時に不安になるのか、どう考えちゃうのか、少しずつわかるようになってきた気がします。
[32m10-25 19:55:11[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(649216,)
[32m10-25 19:55:11[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。ええ、なるほど。漠然とした不安が、より具体的に見えてきた、ということですね。素晴らしい気づきですね。
[32m10-25 19:55:12[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(431104,)
[32m10-25 19:55:12[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。特に、以前お話しした、集団面接でのディスカッションの時、「私は口



[32m10-25 19:55:14[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(824320,)
[32m10-25 19:55:14[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、ありがとうございます。よろしくお願いします。
[32m10-25 19:55:14[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(183296,)
[32m10-25 19:55:14[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、こちらこそ、ありがとうございました。
[32m10-25 19:55:14[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(128512,)


[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:36[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:06[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_2...[0m                                                                
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_2[0m!                                                                  
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m147.451[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:33[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_2...[0m                                                                
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_2[0m!                                                                  
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m141.964[0m seconds                                 


[32m10-25 20:00:35[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Bさん。今日はどんなことをお話ししましょうか。
[32m10-25 20:00:35[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(169984,)
[32m10-25 20:00:35[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。なんだか、最近ずっと焦りを感じていて…仕事もプライベートも、もっと「こうあるべき」っていう気持ちが強いんです。
[32m10-25 20:00:35[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(416256,)
[32m10-25 20:00:35[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですか。「こうあるべき」という気持ちが、はい。具体的には、どんな時にそう感じることが多いですか？
[32m10-25 20:00:36[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(337408,)
[32m10-25 20:00:36[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ。例えば、仕事だと「もっと完璧な資料を作らなければならない」とか、家では「もっと家事をきちんとすべきだ」とか…いつも頭の中で、そう思ってしまうんです。
[32m10-25 20:00:36[0m |[1m  INFO  [0m| tts_model.py:3



[32m10-25 20:00:36[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(472064,)
[32m10-25 20:00:36[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですね…すごくプレッシャーを感じて、息苦しくなります。結局、どれも中途半端に終わってしまって、また「だめだ」って自分を責めてしまうんです。
[32m10-25 20:00:36[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(487424,)
[32m10-25 20:00:36[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
それは、とてもお辛いですね。はい。完璧を求めようと頑張っていらっしゃるのに、かえってご自身を責めてしまう、と。
[32m10-25 20:00:36[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(402944,)
[32m10-25 20:00:36[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、まさにそんな感じです。どうしたらいいのか、分からなくて。
[32m10-25 20:00:37[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(215552,)
[32m10-25 20:00:37[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。Bさんの



[32m10-25 20:00:37[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(809472,)
[32m10-25 20:00:37[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
…はい、そうかもしれません。考えてみたいです。
[32m10-25 20:00:37[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(173056,)
[32m10-25 20:00:37[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ありがとうございます。では、今日のところは、Bさんがおっしゃった「〜すべき」というお気持ちについて、それがどんな時に、どんな風にBさんに影響を与えているのか、少しお話を聞かせていただきました。次回は、もう少し掘り下げて、その「〜すべき」の裏側にあるBさんの大切にされていることなども、一緒に見ていけたらと思いますが、いかがでしょうか。




[32m10-25 20:00:37[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(1113600,)
[32m10-25 20:00:37[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、お願いします。少し、気持ちが楽になりました。
[32m10-25 20:00:37[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(187392,)
[32m10-25 20:00:37[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
よかったです。ええ。では、次回は来週の同じ時間でよろしいでしょうか。
[32m10-25 20:00:38[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(222208,)
[32m10-25 20:00:38[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、大丈夫です。
[32m10-25 20:00:38[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(75776,)
[32m10-25 20:00:38[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
承知いたしました。では、今日はこれで終わりにしましょう。お気をつけてお帰りくださいね。
[32m10-25 20:00:38[0m |[1m  INFO  [0m| tts_model.py:3

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:28[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_3...[0m                                                                
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_3[0m!                                                                  
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m139.922[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:31[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_3...[0m                                                                
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_3[0m!                                                                  
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m140.959[0m seconds                                 


[32m10-25 20:05:53[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Bさん。前回の続きになりますが、最近何か気になっていることや、お話ししたいことはありますか？
[32m10-25 20:05:53[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(357376,)
[32m10-25 20:05:53[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、先生。最近、なんだか仕事でもプライベートでも、『もっとこうあるべきだ』と考えてしまって、すごく疲れるんです。
[32m10-25 20:05:53[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(373760,)
[32m10-25 20:05:53[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
なるほど。『もっとこうあるべきだ』、ですか。ええ、具体的にどのような時にそう感じることが多いですか？
[32m10-25 20:05:53[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(342528,)
[32m10-25 20:05:53[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですね…。例えば、仕事で新しいプロジェクトを任された時も、『完璧にこなさなければならない』って強く思ってしまって。少しでもうまくいかないと、『自分はダメだ』って責めてしまうんです。
[32m10-25 20:05:54[0m

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:38[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_4...[0m                                                                
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_4[0m!                                                                  
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m147.910[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:33[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_4...[0m                                                                
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_4[0m!                                                                  
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m144.223[0m seconds                                 


[32m10-25 20:11:24[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、〇〇さん。本日は来てくださってありがとうございます。今日はどんなことでお困りですか？
[32m10-25 20:11:24[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(280576,)
[32m10-25 20:11:24[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。なんだか、うまく話せるか不安ですが…。最近、仕事のことで漠然とした不安や焦りを感じていて。日常生活全般にもモヤモヤするんです。
[32m10-25 20:11:24[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(495104,)
[32m10-25 20:11:24[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。お仕事での不安や焦り、そして日常生活全体へのモヤモヤ、ですか。それはお辛いですね。具体的に、どんな時にそういった気持ちになりますか？
[32m10-25 20:11:24[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(575488,)
[32m10-25 20:11:24[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
うーん…新しいプロジェクトが始まるときなんかは、失敗したらどうしようって。休日も、何かしないとって焦るばかりで、結局何も手につかなくて。
[32m10-25 20:1

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:21[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_5...[0m                                                                
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_5[0m!                                                                  
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m130.219[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:23[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_5...[0m                                                                
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_5[0m!                                                                  
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m133.798[0m seconds                                 


[32m10-25 20:16:38[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。今日はよくいらっしゃいましたね。どうぞ楽にしてください。
[32m10-25 20:16:38[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(221696,)
[32m10-25 20:16:38[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ありがとうございます。少し緊張して…仕事のことが一番で、最近、全部うまくいかない気がします。
[32m10-25 20:16:38[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(346112,)
[32m10-25 20:16:38[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。ええ。お仕事が一番の悩みなんですね。
[32m10-25 20:16:39[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(200192,)
[32m10-25 20:16:39[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
新しいプロジェクトで常に追われています。家でも仕事が頭から離れず、夜も眠れなくて。
[32m10-25 20:16:39[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(306688,)
[32m10-25 20:16:39[

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:18[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_6...[0m                                                                
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_6[0m!                                                                  
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m129.702[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:13[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_6...[0m                                                                
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_6[0m!                                                                  
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m121.544[0m seconds                                 


[32m10-25 20:22:09[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、こんにちは。今日はこれまでのセッションを振り返りながら、終結に向けてのお話をできたらと思っていますが、いかがでしょうか。
[32m10-25 20:22:09[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(412672,)
[32m10-25 20:22:09[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、もうそんな時期なんですね。あっという間でしたが、本当に色々なことが変わったように感じます。
[32m10-25 20:22:09[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(328192,)
[32m10-25 20:22:09[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうですね。はい。最初にお越しになった頃は、漠然とした不安を抱えていらっしゃいましたね。今、特にどんな変化を感じていらっしゃいますか。
[32m10-25 20:22:09[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(478208,)
[32m10-25 20:22:09[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
一番は、自分の気持ちに気づけるようになったこと、でしょうか。感情の理由が少し冷静に捉えられるようになりました。
[32m10-25 20:22:10[0m |[1m  INFO 

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:22[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_7...[0m                                                                
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_7[0m!                                                                  
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m131.632[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:20[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_7...[0m                                                                
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_7[0m!                                                                  
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m130.822[0m seconds                                 


[32m10-25 20:27:27[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。今日はよく来てくださいましたね。どうぞ楽にしてください。
[32m10-25 20:27:27[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(227840,)
[32m10-25 20:27:27[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、緊張しています。
[32m10-25 20:27:27[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(87040,)
[32m10-25 20:27:27[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうですよね。初めての場所は緊張しますものね。今日はどんなことをお話ししたい気分ですか？
[32m10-25 20:27:27[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(329728,)
[32m10-25 20:27:27[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
最近、なんだか漠然とした不安があって、仕事もプライベートも、このままでいいのかなって焦るんです。
[32m10-25 20:27:27[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(334336,)
[32m10-25 20:27:27[0m |[1m

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:20[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_8...[0m                                                                
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_8[0m!                                                                  
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m130.633[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:17[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:06[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_8...[0m                                                                
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_8[0m!                                                                  
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m128.747[0m seconds                                 


[32m10-25 20:32:24[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
Bさん、こんにちは。今日はこれまでのセッションを少し振り返ってみませんか。ここまで、色々なことをお話しくださいましたものね。
[32m10-25 20:32:24[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(408064,)
[32m10-25 20:32:24[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、そうですね。ええ。最初は漠然とした不安でいっぱいでした。仕事のことも、プライベートのことも、なんだか全部がうまくいかないような気がして…。
[32m10-25 20:32:24[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(527872,)
[32m10-25 20:32:24[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうでしたね。漠然とした不安、そしてお仕事のことや、日常生活での焦り…「どうしたらいいんだろう」というお気持ちをたくさんお話ししてくださいましたね。はい。
[32m10-25 20:32:24[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(575488,)
[32m10-25 20:32:24[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。でも、ここにきて先生とお話しするうちに、少しずつ「何が不安の原因なのか」とか、「どうして焦ってしまうのか」といった、

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:45[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_9...[0m                                                                
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_9[0m!                                                                  
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m155.832[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:43[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_9...[0m                                                                
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_9[0m!                                                                  
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m150.869[0m seconds                                 


[32m10-25 20:38:32[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
Bさん、こんにちは。前回の後、何か気づかれたことはありましたか？
[32m10-25 20:38:32[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(243200,)
[32m10-25 20:38:32[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、常に「もっとこうしなきゃ」とか「もっと頑張らなきゃ」って焦っている自分がいて、それがしんどいです。
[32m10-25 20:38:32[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(293888,)
[32m10-25 20:38:32[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
「もっとこうしなきゃ、頑張らなきゃ」というお気持ちですね。ええ。どんな時に特に強く感じられますか？
[32m10-25 20:38:33[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(317440,)
[32m10-25 20:38:33[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
仕事中も、家でも「部屋をきれいにすべきだ」とか「もっと趣味に時間を使うべきなのに」って、考えてしまいます。




[32m10-25 20:38:33[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(337920,)
[32m10-25 20:38:33[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
なるほど。仕事でも自宅でも、「〜すべきだ」という考えが巡るのですね。そういった時、どんな風に感じますか？
[32m10-25 20:38:33[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(379904,)
[32m10-25 20:38:33[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
焦りや、「自分はダメだ」という不安な気持ちになります。全然できてないって。




[32m10-25 20:38:33[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(254464,)
[32m10-25 20:38:33[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
焦りや不安、ご自身を責める気持ちに繋がるのですね。その「すべき」という考えが、Bさんを苦しめていると感じることは？
[32m10-25 20:38:33[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(403968,)
[32m10-25 20:38:33[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
まさにそうです。わかっていても、止められなくて……。
[32m10-25 20:38:33[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(164864,)
[32m10-25 20:38:33[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうなんですね。この「〜すべきだ」という考え、Bさんの頑張り屋さんの証拠でもありますね。でも、それが時にBさんを追い詰めているのかもしれません。今日は、この「すべき思考」の影響についてお話しできました。




[32m10-25 20:38:34[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(712192,)
[32m10-25 20:38:34[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。少し客観的に見られた気がします。
[32m10-25 20:38:34[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(143360,)
[32m10-25 20:38:34[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
大きな気づきですね。次回は、この「すべき思考」と少し距離を置くための方法を、一緒に考えてみましょう。
[32m10-25 20:38:34[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(354304,)
[32m10-25 20:38:34[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、ぜひお願いします。
[32m10-25 20:38:34[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(88064,)
[32m10-25 20:38:34[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ありがとうございます。では、今日のセッションはここまでとさせていただきます。お疲れ様でした。
[32m10-25 20:38:35[0m |[1m  INFO  [0m|

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:16[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_10...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_10[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m124.800[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:17[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_10...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_10[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m128.002[0m seconds                                 


[32m10-25 20:43:17[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。本日はお越しいただきありがとうございます。今日はどんなことをお話ししたい気分ですか？
[32m10-25 20:43:17[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(313344,)
[32m10-25 20:43:17[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。あの、なんだか最近、色々と漠然とした不安や焦りを感じていて…何から話したらいいのか、自分でもよくわからないんです。
[32m10-25 20:43:17[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(459264,)
[32m10-25 20:43:17[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。漠然とした不安や、焦りを感じていらっしゃるんですね。ええ。自分でも何から話したらいいか分からない、というお気持ち、よく分かりますよ。もしよろしければ、今一番、頭に浮かんでいることや、最近気になっていることから、お話しいただけますか？
[32m10-25 20:43:18[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(868352,)
[32m10-25 20:43:18[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい…そうですね。仕事のこともそうなんですけど、なんかこう、家でも落ち着かなくて

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:39[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_11...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_11[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m150.466[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:44[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_11...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_11[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m152.175[0m seconds                                 


[32m10-25 20:48:51[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、今日は来てくださってありがとうございます。今日はどんなことをお話ししたいですか？ゆっくりお話しくださいね。
[32m10-25 20:48:51[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(372224,)
[32m10-25 20:48:51[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。あの、どこから話したらいいのか、ちょっと漠然としすぎているんですけど…最近、なんだかずっと不安で、焦っているような気がして。仕事もプライベートも、全部うまくいっていない気がするんです。
[32m10-25 20:48:51[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(671744,)
[32m10-25 20:48:51[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、そうなんですね。漠然とした不安や焦りを感じていらっしゃるんですね。ええ、大丈夫ですよ。焦らなくても、ゆっくりで構いませんからね。具体的に、どんな時にそのように感じることが多いですか？
[32m10-25 20:48:52[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(675840,)
[32m10-25 20:48:52[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですね…仕事中はもちろん、家に帰ってからも、

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:45[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_12...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_12[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m156.014[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h[?25l

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:50[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_12...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_12[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m160.648[0m seconds                                 


[32m10-25 20:54:56[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、〇〇さん。今日はどのようなことについてお話ししたいですか？
[32m10-25 20:54:56[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(190976,)
[32m10-25 20:54:56[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。最近、漠然とした不安と焦りがあって。仕事も家でも落ち着かないんです。
[32m10-25 20:54:56[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(286208,)
[32m10-25 20:54:56[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですか。漠然とした不安や焦りですね。仕事でも家でも、ですか。ええ、お辛いですね。
[32m10-25 20:54:57[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(335872,)
[32m10-25 20:54:57[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。常に『何かしないと』って、頭のどこかで思ってる気がして。
[32m10-25 20:54:57[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(218112,)
[32m10-25 20:54:57

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:22[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_13...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_13[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m131.391[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:15[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:06[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_13...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_13[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m127.027[0m seconds                                 


[32m10-25 21:00:26[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。今日はよくお越しくださいましたね。何か、お話ししたいことなどありますか？
[32m10-25 21:00:27[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(276992,)
[32m10-25 21:00:27[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。漠然とした不安がずっとあって。仕事もプライベートも、落ち着かないんです。
[32m10-25 21:00:27[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(290304,)
[32m10-25 21:00:27[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。全体的に落ち着かない、漠然とした不安…ええ、そうでしたか。
[32m10-25 21:00:27[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(308224,)
[32m10-25 21:00:27[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。仕事でミスが増えたり、集中できなかったり。休みの日も「何かやらなきゃ」って焦るばかりで。
[32m10-25 21:00:27[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(328704,)




[32m10-25 21:00:28[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(227840,)
[32m10-25 21:00:28[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、ご自身の変化を感じ、責める気持ちもあるのですね。今日は、ここまでお話しいただけて良かったです。
[32m10-25 21:00:28[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(356864,)
[32m10-25 21:00:28[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
次回、もう少し詳しく伺えたらと思いますが、いかがでしょうか。
[32m10-25 21:00:28[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(212480,)
[32m10-25 21:00:28[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、ぜひお願いします。少し話しただけでも、気持ちが楽になりました。
[32m10-25 21:00:28[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(255488,)
[32m10-25 21:00:28[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
それは良かったです。では、次回の予約についてですが…。
[32m10-25 21:00:29[0m |[

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:14[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_14...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_14[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m125.217[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h[?25l

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:13[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_14...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_14[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m123.594[0m seconds                                 


[32m10-25 21:05:42[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Bさん。今日はどんな感じですか？
[32m10-25 21:05:43[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(132608,)
[32m10-25 21:05:43[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Aさん。毎日「〜しなきゃ」って焦る気持ちが強くて、疲れます。
[32m10-25 21:05:43[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully




(242176,)
[32m10-25 21:05:43[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
なるほど、ええ。「〜しなきゃ」というお気持ちですね。どんな時にそう感じますか？
[32m10-25 21:05:43[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully




(259072,)
[32m10-25 21:05:43[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、仕事中です。「完璧に」「すぐ返信」とか、全部「〜すべき」って考えちゃって。




[32m10-25 21:05:43[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(297472,)
[32m10-25 21:05:43[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。その「〜すべき」がBさんを追い詰めているように聞こえます。どんなお気持ちですか？




[32m10-25 21:05:43[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(302080,)
[32m10-25 21:05:43[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、息苦しくて、できないと「ダメだ」って、落ち込んじゃうんです。
[32m10-25 21:05:44[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(186368,)
[32m10-25 21:05:44[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
なるほど、お辛いですね。ご自身を苦しめている感覚でしょうか。
[32m10-25 21:05:44[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(230400,)
[32m10-25 21:05:44[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、まさに。完璧じゃないと、評価されないんじゃないかって不安で。
[32m10-25 21:05:44[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(208896,)
[32m10-25 21:05:44[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ。「評価されない」不安から、「〜すべき」が強くなるのかもしれませんね。完璧でなくても大丈夫、と思えたら、少し楽になれそうですか？




[32m10-25 21:05:44[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(463872,)
[32m10-25 21:05:44[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
うーん……そう言われると、そうですね。でも、難しいです。
[32m10-25 21:05:44[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(210432,)
[32m10-25 21:05:44[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、すぐには難しいですよね。でも、その「〜すべき」に気づくのが大切なんです。次回までに、「本当にそうかな？」と、少し立ち止まって考えてみませんか？




[32m10-25 21:05:45[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(535552,)
[32m10-25 21:05:45[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、やってみます。
[32m10-25 21:05:45[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(78336,)
[32m10-25 21:05:45[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ。今日のセッションで、ご自身の考えに気づけたのは大きな一歩です。では、次回の予約ですが……。
[32m10-25 21:05:45[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(338944,)


[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:15[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_15...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_15[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m123.296[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:13[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_15...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_15[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m122.651[0m seconds                                 


[32m10-25 21:10:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、〇〇さん。今日は来てくださってありがとうございます。今日はどんなことをお話しいただけますか？
[32m10-25 21:10:23[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(288768,)
[32m10-25 21:10:23[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。あの、最近、なんだか漠然と不安な気持ちが続いていて、何から話したらいいのか…正直、よくわからなくて。
[32m10-25 21:10:23[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(458752,)
[32m10-25 21:10:23[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。漠然とした不安な気持ちが続いているんですね。ええ、大丈夫ですよ。話したいことから、ゆっくりお話しくださいね。私も一緒に考えさせていただきますから。
[32m10-25 21:10:23[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(601600,)
[32m10-25 21:10:23[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい…。仕事のことももちろんあるんですが、家での時間もなんとなく落ち着かないというか、常に焦っているような感じがして…休むべき時も、休めている気がしないんです。
[32m

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:37[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_16...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_16[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m147.306[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:35[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_16...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_16[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m146.233[0m seconds                                 


[32m10-25 21:16:02[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、今日はセッションの振り返りをしてみませんか。終結に向けて、どんな変化があったか、お話しいただけると嬉しいです。
[32m10-25 21:16:02[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(387072,)
[32m10-25 21:16:02[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。最初は本当に漠然とした不安ばかりで、どうしたらいいか分からなかったです。
[32m10-25 21:16:03[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(262144,)
[32m10-25 21:16:03[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、漠然とした不安でしたね。そうでした。
[32m10-25 21:16:03[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(154624,)
[32m10-25 21:16:03[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ。でも、先生と話す中で、何に不安を感じているのか、具体的に見えてきた気がします。
[32m10-25 21:16:03[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(290816,)
[32m1

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:21[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_17...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_17[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m130.001[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:19[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_17...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_17[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m129.526[0m seconds                                 


[32m10-25 21:21:38[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
Bさん、こんにちは。今日はどのようなお話しから始めましょうか？
[32m10-25 21:21:38[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(206848,)
[32m10-25 21:21:38[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。最近、常に落ち着かなくて…仕事もプライベートも、「こうすべき」「こうしなきゃ」って考えて、疲れてしまうんです。
[32m10-25 21:21:38[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(400384,)
[32m10-25 21:21:38[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
「〜すべき」「〜しなきゃ」というお気持ちが強いのですね。ええ、お疲れなんですね。
[32m10-25 21:21:39[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(249856,)
[32m10-25 21:21:39[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。会社で資料を作る時も、完璧じゃないとダメだと。少しの抜けでも「自分はダメだ」と落ち込むんです。周りはもっと要領よくこなしてるのに、って。




[32m10-25 21:21:39[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(512000,)
[32m10-25 21:21:39[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
完璧でないとダメだと感じ、ご自身を責めるのですね。周りと比べてお辛い気持ち、よくわかります。
[32m10-25 21:21:39[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(379392,)
[32m10-25 21:21:39[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんです。休日も休みたいのに、「何か生産的なことをしなきゃ」って焦って。何もできないと自分にイライラしてしまいます。
[32m10-25 21:21:39[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(400896,)
[32m10-25 21:21:39[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、休日も「何か生産的なことをしなきゃ」という気持ちが強く、心が休まらない感覚でしょうか。
[32m10-25 21:21:39[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(344064,)
[32m10-25 21:21:39[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
まさに。休んでも休んだ気がしないと



(481280,)
[32m10-25 21:21:40[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
今日は具体的にお話しくださり、ありがとうございました。次回は、この考え方について、どんな時に強く感じるのか、それがBさんの心にどんな影響を与えているのか、一緒に見ていけたらと思います。いかがですか？
[32m10-25 21:21:40[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(723968,)
[32m10-25 21:21:40[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、ありがとうございます。話せて少し楽になった気がします。ぜひ、そうさせてください。
[32m10-25 21:21:40[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(310272,)
[32m10-25 21:21:40[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
良かったです。では、次回の予約についてですが…
[32m10-25 21:21:40[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(161280,)


[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:24[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_18...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_18[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m135.148[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:17[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_18...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_18[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m129.274[0m seconds                                 


[32m10-25 21:26:39[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。今日はよくいらっしゃいましたね。お話しできる範囲で構いませんので、今日はどんなことでしょうか？
[32m10-25 21:26:39[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(339456,)
[32m10-25 21:26:39[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、こんにちは。最近、仕事のこととか、なんだか漠然とした不安がずっとあって…。何から話したらいいのか、自分でもよくわからなくて。
[32m10-25 21:26:39[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(476160,)
[32m10-25 21:26:39[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。漠然とした不安、ですか。はい、無理にまとめる必要はないですよ。少しずつお話を聞かせていただけますか？
[32m10-25 21:26:40[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(428544,)
[32m10-25 21:26:40[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ。なんだか、周りの人はみんなちゃんとやっているように見えるのに、自分だけ置いていかれているような気がして。仕事でも、小さなミスが気になって、ずっと引きずってしまうんです。
[32m10-25 

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:31[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_19...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_19[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m140.255[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:31[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_19...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_19[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m141.163[0m seconds                                 


[32m10-25 21:32:19[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
最近、どうも気持ちが落ち着かなくて。仕事もプライベートも、もっときちんとこなさなきゃいけないのに、全然できてないなって感じがして。
[32m10-25 21:32:20[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(407552,)
[32m10-25 21:32:20[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。気持ちが落ち着かない日々が続いているのですね。ええ。もっと「きちんとこなさなきゃ」というお気持ちなんですね。
[32m10-25 21:32:20[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(392192,)
[32m10-25 21:32:20[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。みんなはもっとテキパキやってるはずなのに、自分だけ取り残されてるような気がして。もっと頑張らないと、っていつも自分を追い込んじゃうんです。
[32m10-25 21:32:20[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(431104,)
[32m10-25 21:32:20[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
なるほど。周りの方と比べて「自分だけが」と感じていらっしゃるのですね。そして、「もっと頑張らないと」と、ご自身を追い込んでしまうと。ええ。
[32m1



[32m10-25 21:32:21[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(473088,)
[32m10-25 21:32:21[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
うーん…なんか、背中を押されるような、でもちょっと重い、みたいな。やらなきゃいけないって分かってるから、やらないとダメだって。
[32m10-25 21:32:21[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(389632,)
[32m10-25 21:32:21[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、はい。背中を押されるような気持ちと、少し重いお気持ちですね。もし、少しだけ「やらなくてもいい」と許してあげるとしたら、どう感じられるでしょう？
[32m10-25 21:32:21[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(489472,)
[32m10-25 21:32:21[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
でも、やらなかったら、もっと状況が悪くなるんじゃないかって、不安で。
[32m10-25 21:32:21[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(223232,)
[32m10-25 21:32:21[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text



[32m10-25 21:32:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(109056,)
[32m10-25 21:32:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ。今日は、Bさんの「〜すべき」というお気持ちについて、お話ができましたね。次回は、それがどんな場面で出てくるのか、もう少し詳しくお伺いしてもよろしいでしょうか。




[32m10-25 21:32:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(542208,)
[32m10-25 21:32:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、わかりました。少し考えてみます。
[32m10-25 21:32:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(164864,)
[32m10-25 21:32:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ありがとうございます。今日はこのあたりで。また来週、同じ時間にお待ちしておりますね。
[32m10-25 21:32:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(286720,)


[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:45[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_20...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_20[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m154.812[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:28[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_20...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_20[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m139.489[0m seconds                                 


[32m10-25 21:38:21[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Bさん。今日は、これまでのセッションを振り返りましょうか。終結に向けての時期ですからね。
[32m10-25 21:38:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(300544,)
[32m10-25 21:38:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、そうですね。寂しさも少し、でもスッキリした部分もあって。
[32m10-25 21:38:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(218624,)
[32m10-25 21:38:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。寂しさも感じますか。スッキリしたというのは大きな変化。何か「変わったな」と感じることは？
[32m10-25 21:38:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(369152,)
[32m10-25 21:38:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
うーん、以前は漠然とした不安が常にあったんですけど…。
[32m10-25 21:38:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(172544,)
[32m10-

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:16[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_21...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_21[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m127.368[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:11[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_21...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_21[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m121.136[0m seconds                                 


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h[32m10-25 21:43:18[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、こんにちは。今日はこれまでのセッションを振り返りながら、終結に向けて考えていけたらと思うのですが、いかがでしょうか。
[32m10-25 21:43:19[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(400896,)
[32m10-25 21:43:19[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、そうですね。もうそんな時期なんですね。
[32m10-25 21:43:19[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(164864,)
[32m10-25 21:43:19[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ。ここまで色々取り組まれましたが、何か変化を感じることはありますか？
[32m10-25 21:43:19[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(247296,)
[32m10-25 21:43:19[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
漠然とした不安が減

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:17[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_22...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_22[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m127.781[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:13[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_22...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_22[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m123.636[0m seconds                                 


[32m10-25 21:48:14[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
Bさん、今日はこれまでのセッションを振り返りながら、今後のことについてお話しできたらと思うのですが、いかがでしょうか。
[32m10-25 21:48:14[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(371200,)
[32m10-25 21:48:14[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、お願いします。あっという間でしたね。最初は漠然と不安で、何から話していいか分からなかったんですけど。
[32m10-25 21:48:15[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(375808,)
[32m10-25 21:48:15[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうでしたね。ええ。そこからここまで、Bさんご自身、何か変化を感じることはありますか？
[32m10-25 21:48:15[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(301056,)
[32m10-25 21:48:15[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。前は些細なことでクヨクヨしていましたが、最近は「なんとかなる」と思えることが増えました。気持ちが楽になったんです。
[32m10-25 21:48:15[0m |[1m  INFO  [0m| tts_model.py:324

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:16[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:06[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_23...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_23[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m127.926[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:15[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_23...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_23[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m126.900[0m seconds                                 


[32m10-25 21:53:09[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、今日はよくお越しくださいましたね。どうぞ楽にしてください。今日はどんなことをお話ししたい気分ですか？
[32m10-25 21:53:09[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(351232,)
[32m10-25 21:53:09[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ありがとうございます。なんだか最近、仕事もプライベートも、全部がうまくいってない気がして...漠然とした焦りばかり感じてしまうんです。
[32m10-25 21:53:10[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(444416,)
[32m10-25 21:53:10[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうなんですね。漠然とした焦りを感じていらっしゃるんですね。具体的にどんな時に、そう感じることが多いですか？
[32m10-25 21:53:10[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(408576,)
[32m10-25 21:53:10[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。仕事では、周りの人がどんどん成果を出しているのを見ると、自分だけ取り残されているような気がして。家に帰っても、あれもこれもやらなきゃって気持ちばかりで、結局何も手につかない、みたいな日々



[32m10-25 21:53:12[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(1225216,)
[32m10-25 21:53:12[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、ぜひ。今日は少し気持ちが楽になりました。
[32m10-25 21:53:13[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(161280,)
[32m10-25 21:53:13[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
良かったです。では、次回の予約ですが、来週の同じ曜日、時間はいかがでしょうか？それまでに、もし何か感じたことや、メモしておきたいことがあれば、気軽に書き留めておいてくださいね。
[32m10-25 21:53:13[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(602624,)
[32m10-25 21:53:13[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、分かりました。来週もよろしくお願いします。
[32m10-25 21:53:13[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(183808,)


[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:32[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_24...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_24[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m145.626[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:41[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_24...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_24[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m153.116[0m seconds                                 


[32m10-25 21:59:05[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Bさん。今日はどんな感じでお越しになりましたか？
[32m10-25 21:59:05[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(176640,)
[32m10-25 21:59:05[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。最近、「こうすべき」という考えに縛られて、少し息苦しいんです。
[32m10-25 21:59:05[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(242688,)
[32m10-25 21:59:05[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
「こうすべき」ですか。なるほど。どんな時にそう感じますか？




[32m10-25 21:59:06[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(203776,)
[32m10-25 21:59:06[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
仕事で、もっと完璧にしなきゃと。ミスがあると、自分はダメだって落ち込んでしまうんです。
[32m10-25 21:59:06[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(289792,)
[32m10-25 21:59:06[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。完璧でなければ、と感じてしまう。そのお気持ち、よく分かります。
[32m10-25 21:59:06[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(301056,)
[32m10-25 21:59:06[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。周りと比べて、自分だけができていないような焦りも感じてしまって。
[32m10-25 21:59:06[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(251904,)
[32m10-25 21:59:06[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
周りと比べて焦るお気持ち。この「〜すべき」という考えが、Bさんを苦しめているのかもしれませんね。




[32m10-25 21:59:07[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(345088,)
[32m10-25 21:59:07[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうかもしれません。少し疲れてしまって…。
[32m10-25 21:59:07[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(147968,)
[32m10-25 21:59:07[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですよね。今日は、この「〜すべき」という考えが、Bさんの心にどんな影響を与えているのか、お話しいただけてありがたかったです。




[32m10-25 21:59:07[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(427008,)
[32m10-25 21:59:07[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こちらこそ、話せて少し楽になりました。
[32m10-25 21:59:07[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(142848,)
[32m10-25 21:59:07[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
よかったです。次回は、この「〜すべき」という考えについて、もう少しゆっくり見ていきましょうか。




[32m10-25 21:59:08[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(304640,)
[32m10-25 21:59:08[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、お願いします。
[32m10-25 21:59:08[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(78848,)
[32m10-25 21:59:08[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
承知しました。では、今日はここまでに。次回は来週の同じ時間でよろしいでしょうか？
[32m10-25 21:59:08[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(273920,)
[32m10-25 21:59:08[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、大丈夫です。
[32m10-25 21:59:08[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(74752,)
[32m10-25 21:59:08[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ありがとうございます。お気をつけてお帰りくださいね。
[32m10-25 21:59:09[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated su

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:15[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_25...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_25[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m125.973[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:12[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_25...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_25[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m127.046[0m seconds                                 


[32m10-25 22:04:14[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。Bさん、今日は前回少し触れた「〜すべき」というお気持ちについて、お話しできたらと思いますが、いかがでしょうか？




[32m10-25 22:04:14[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(402432,)
[32m10-25 22:04:14[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、最近「完璧にすべきだ」という気持ちが強くて、なんだか疲れてしまうんです。
[32m10-25 22:04:14[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(262656,)
[32m10-25 22:04:14[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
なるほど。「完璧にすべきだ」と。ええ、どんな時にそう感じますか？
[32m10-25 22:04:15[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(248832,)
[32m10-25 22:04:15[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
仕事でミスなく効率的に、とか。家でも「もっと綺麗にすべき」って考えてしまって。
[32m10-25 22:04:15[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(274944,)
[32m10-25 22:04:15[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうなんですね。それがBさんをどんな気持ちにさせますか？
[32m10-25 22:04:15[0m |[



[32m10-25 22:04:16[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(256512,)
[32m10-25 22:04:16[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
楽になれる気はします。でも、それが許されるのか…と。
[32m10-25 22:04:16[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(186368,)
[32m10-25 22:04:16[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そのお気持ちも分かります。その「〜すべき」という考えが、Bさんを苦しめているのかもしれません。




[32m10-25 22:04:16[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(347136,)
[32m10-25 22:04:16[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
苦しめている、ですか。当たり前だと思っていました。
[32m10-25 22:04:16[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(164864,)
[32m10-25 22:04:16[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですよね。でも、その考えを少し緩めることで、心はもっと自由になれるかもしれません。今日はその可能性に触れられましたね。
[32m10-25 22:04:17[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(453120,)
[32m10-25 22:04:17[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。新しい見方だなと思いました。
[32m10-25 22:04:17[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(135680,)
[32m10-25 22:04:17[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、ありがとうございます。今日の気づきを心に留めて、次回お聞かせください。今日はここまでとしましょう。
[32m10-25 22

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:17[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_26...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_26[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m133.167[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:16[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_26...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_26[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m133.504[0m seconds                                 


[32m10-25 22:09:30[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。今日は、お話しにお越しいただきありがとうございます。どのようなことでお困りでしょうか？
[32m10-25 22:09:31[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(303616,)
[32m10-25 22:09:31[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、こんにちは。最近、漠然とした不安や焦りを感じていて…。仕事も日常生活も、どうもすっきりしないんです。
[32m10-25 22:09:31[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(399360,)
[32m10-25 22:09:31[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。漠然とした不安や焦りを感じていらっしゃる。ええ、ありがとうございます。具体的に、どんな時にそう感じることが多いですか？
[32m10-25 22:09:31[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(461312,)
[32m10-25 22:09:31[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
うーん、仕事ではタスクに追われている感覚で。プライベートでも、休日にしっかり休めている感じがしなくて、常に何かに追われているような気持ちで…。
[32m10-25 22:09:31[0m |[1m  INF

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:25[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:06[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_27...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_27[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m141.853[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:23[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_27...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_27[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m141.103[0m seconds                                 


[32m10-25 22:14:48[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、今日はどうぞよろしくお願いいたします。今日はどのようなことをお話ししたいと思われますか？
[32m10-25 22:14:48[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(302080,)
[32m10-25 22:14:48[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、よろしくお願いいたします。えっと、最近、仕事でちょっと行き詰まっている感じで、それだけじゃなくて、なんだか普段の生活でも漠然とした不安とか焦りを感じていて…何から話せばいいのか、という状況なんです。
[32m10-25 22:14:49[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(690688,)
[32m10-25 22:14:49[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。仕事のことだけでなく、日常生活でも不安や焦りを感じていらっしゃる、ということですね。ええ、まずは今感じていることを、〇〇さんのペースでゆっくりお聞かせいただけますか？
[32m10-25 22:14:49[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(601600,)
[32m10-25 22:14:49[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。仕事では、新しいプロジェクトを任されて、プレッシャーを感じて

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:29[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_28...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_28[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m149.807[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:29[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_28...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_28[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m152.620[0m seconds                                 


[32m10-25 22:20:27[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Bさん。先週お話しいただいた、仕事でのプレッシャーについて、今日はもう少し詳しくお聞かせいただけますか？
[32m10-25 22:20:27[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(383488,)
[32m10-25 22:20:27[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、先生。最近、何をするにも『こうすべきだ』って考えてしまって、すごく息苦しいんです。
[32m10-25 22:20:27[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(276480,)
[32m10-25 22:20:27[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ああ、『こうすべきだ』、ですか。なるほど、そうなんですね。具体的には、どのような時にそう感じることが多いですか？
[32m10-25 22:20:28[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(394240,)
[32m10-25 22:20:28[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
仕事でミスをしてはいけない、とか、常に完璧でなければならない、とか…家でも、もっと家事をきちんとすべきだ、って。
[32m10-25 22:20:28[0m |[1m  INFO  [0m| tts_model.py:324 



[32m10-25 22:20:28[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(573440,)
[32m10-25 22:20:28[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
いつも追い詰められているような気持ちになって、疲れてしまいます。少しでもできないと、自分はダメだって…
[32m10-25 22:20:28[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(332800,)
[32m10-25 22:20:28[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、お辛いですね。そうですね、常に完璧を目指すというのは、本当に大変なことですよね。もし、その『完璧でなければならない』という考えが、少しだけ緩んだとしたら、Bさんはどう感じられるでしょうか？
[32m10-25 22:20:29[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(705024,)
[32m10-25 22:20:29[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
え…？緩む、ですか。でも、緩めたらもっとダメになってしまうんじゃないかと…
[32m10-25 22:20:29[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(246784,)
[32m10-25 22:20:29[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio 

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h[?25l

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:24[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_29...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_29[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m145.305[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:27[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:06[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_29...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_29[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m151.583[0m seconds                                 


[32m10-25 22:26:16[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、〇〇さん。今日はようこそお越しくださいました。ゆっくりお話しくださいね。
[32m10-25 22:26:16[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(251904,)
[32m10-25 22:26:16[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。はい、ありがとうございます。なんだか最近、ずっと落ち着かなくて…。
[32m10-25 22:26:17[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(289280,)
[32m10-25 22:26:17[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。落ち着かない、どんな時にそう感じることが多いですか？
[32m10-25 22:26:17[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(228864,)
[32m10-25 22:26:17[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ。仕事中もですが、家に帰ってからも漠然とした不安や焦りがずっとあって…。
[32m10-25 22:26:17[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(278016,)
[32m10-25 22:2

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:19[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_30...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_30[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m141.757[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:15[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:06[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_30...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_30[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m143.268[0m seconds                                 


[32m10-25 22:32:04[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Bさん。今日はどんなことをお話ししたい気分ですか？
[32m10-25 22:32:04[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(187904,)
[32m10-25 22:32:04[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、先生。最近、「こうしなきゃいけない」って思うことが多くて、すごく負担なんです。
[32m10-25 22:32:05[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(280064,)
[32m10-25 22:32:05[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
「こうしなきゃいけない」ですか。ええ、どんな時にそう感じますか？
[32m10-25 22:32:05[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(205312,)
[32m10-25 22:32:05[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
仕事でも家でも、常に「〜すべき」と考えてしまって…。




[32m10-25 22:32:05[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(174080,)
[32m10-25 22:32:05[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、どんな風に感じますか？
[32m10-25 22:32:05[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(107520,)
[32m10-25 22:32:05[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
焦りますし、できないと「自分はダメだ」って自己嫌悪に陥ります。
[32m10-25 22:32:06[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(203776,)
[32m10-25 22:32:06[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
なるほど、「ダメだ」と感じてしまうのですね。ええ。その「〜すべき」という考えが、Bさんを苦しめているように聞こえます。




[32m10-25 22:32:06[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(409600,)
[32m10-25 22:32:06[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。でも、周りに迷惑をかけるとか、評価が下がるのが怖くて。
[32m10-25 22:32:06[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(232960,)
[32m10-25 22:32:06[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、はい。優しい気持ちからなのですね。もし、少しその「〜すべき」を緩めてみたら、どうなると思いますか？




[32m10-25 22:32:06[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(360448,)
[32m10-25 22:32:06[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
不安はありますが、楽になれるなら、そういう考え方も…と、少し思いました。
[32m10-25 22:32:07[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(264192,)
[32m10-25 22:32:07[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですね、その「少し」の気持ちを大切にしましょう。今日のお話から、Bさんを苦しめる「〜すべき思考」と、その背景にある優しい気持ちが分かりました。はい。次回は、これらとどう向き合うか、考えていきましょう。




[32m10-25 22:32:07[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(743424,)
[32m10-25 22:32:07[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、お願いします。少し、気持ちが軽くなりました。
[32m10-25 22:32:07[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(190976,)
[32m10-25 22:32:07[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
よかったです。今日はお話ししてくださってありがとうございます。では、次回の予約ですが…。
[32m10-25 22:32:07[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(287744,)


[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:15[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_31...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_31[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m135.570[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:14[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_31...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_31[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m128.845[0m seconds                                 


[32m10-25 22:37:13[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。今日はよくお越しくださいましたね。どうぞ、楽な姿勢でお座りください。
[32m10-25 22:37:13[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(272384,)
[32m10-25 22:37:13[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、ありがとうございます。なんだか、少し緊張しますね。
[32m10-25 22:37:14[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(209408,)
[32m10-25 22:37:14[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうですよね。初めての場所で、どんなことを話したらいいか、戸惑うこともあるかもしれませんね。今日は、まず、今お感じになっていることを、ゆっくりお聞かせいただければと思います。
[32m10-25 22:37:14[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(588800,)
[32m10-25 22:37:14[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。最近、なんだか漠然とした不安と焦りがあって…。仕事もそうなんですけど、プライベートでも、何をしていても落ち着かないというか。
[32m10-25 22:37:14[0m |[1m  INFO  [0m| tts_mode



[32m10-25 22:37:15[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(515072,)
[32m10-25 22:37:15[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうなんですね。「うまく言えないんじゃないか」という認知があって、心臓がドキドキしたり手が震えたりする感情、そして、何も言えなくなってしまう行動…。はい、とてもよくわかります。そういう時、お家で過ごされている時も、その気持ちが残っていたりしますか？
[32m10-25 22:37:16[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(902656,)
[32m10-25 22:37:16[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですね…。会議が終わってからも、あの時もっとこう言えばよかったとか、ずっと考えてしまって。夜もなかなか眠れなくて、次の日も体が重いんです。
[32m10-25 22:37:16[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(517120,)
[32m10-25 22:37:16[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、そうなんですね。お仕事だけでなく、日常生活にも影響が出ているということでしたね。なるほど。今日はまず、そうやって漠然と感じていた不安が、少しずつ具体的に見えてきただけでも、大きな一歩だと思いますよ。
[32m10-25 22:37:16[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data gen

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:37[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_32...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_32[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m158.789[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:40[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_32...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_32[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m164.012[0m seconds                                 


[32m10-25 22:43:30[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
Bさん、こんにちは。最近、何か気になっていることはありますか？
[32m10-25 22:43:30[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(211456,)
[32m10-25 22:43:30[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。ええ、なんだか『ちゃんとしなきゃ』って焦ってしまって、疲れてしまうんです。
[32m10-25 22:43:30[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(259072,)
[32m10-25 22:43:30[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
『ちゃんとしなきゃ』ですね。はい。具体的には、どんな時にそう感じますか？
[32m10-25 22:43:30[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(232960,)
[32m10-25 22:43:30[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
仕事も家事も『完璧にこなすべきだ』と思ってしまって。そうすると、何も手につかなくて…
[32m10-25 22:43:31[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(283136,)
[32m10-25 22:43

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:17[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_33...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_33[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m137.465[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:16[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_33...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_33[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m133.315[0m seconds                                 


[32m10-25 22:49:19[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Bさん。今日は、何かお話ししたいことはありますか？ええ。
[32m10-25 22:49:19[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(221696,)
[32m10-25 22:49:19[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、先生。「〜すべきだ」って思うことが多くて、それがプレッシャーなんです。仕事もプライベートも、もっとこうしなきゃって。




[32m10-25 22:49:20[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(400896,)
[32m10-25 22:49:20[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
なるほど。「〜すべきだ」と感じることが多いのですね。ええ、具体的に、どんな時にそう思われますか？




[32m10-25 22:49:20[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(337920,)
[32m10-25 22:49:20[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
仕事だと「もっと効率的に」、家では「家事をきちんと」。友達といても「気の利いたことを言うべきだ」って。
[32m10-25 22:49:20[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(343552,)
[32m10-25 22:49:20[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですか。様々な場面でそう感じるんですね。それはBさんにとって、どんな気持ちにつながっていますか？
[32m10-25 22:49:20[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(350720,)
[32m10-25 22:49:21[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、すごくしんどいです。完璧じゃないとダメって思って、何も手につかなかったり、自分を責めたりして…。
[32m10-25 22:49:21[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(337920,)
[32m10-25 22:49:21[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、お辛いですね。「完璧でなけれ



[32m10-25 22:49:21[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(178688,)
[32m10-25 22:49:21[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。この「すべき思考」で、他に困ることはありますか？
[32m10-25 22:49:21[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(178176,)
[32m10-25 22:49:21[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
週末も「もっと有意義に」と、結局疲れて何もせず。人に頼るのも「自分で解決すべきだ」って思って。
[32m10-25 22:49:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(374784,)
[32m10-25 22:49:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
なるほど。すべき思考が、行動を妨げたり、頼ることを難しくしているんですね。ご自身の「すべき思考」に気づけたこと、大きな一歩ですよ。
[32m10-25 22:49:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(502784,)
[32m10-25 22:49:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ありがとうございます。納得できました。少し気持ちが楽になった

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:25[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:06[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_34...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_34[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m145.880[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:22[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_34...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_34[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m134.469[0m seconds                                 


[32m10-25 22:54:53[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、〇〇さん。本日はありがとうございます。今日はどんなことをお話ししたい気分ですか？
[32m10-25 22:54:53[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(272384,)
[32m10-25 22:54:53[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。えっと、何から話したらいいか…最近、仕事もそうですが、漠然とした不安や焦りがずっとあって、落ち着かないんです。
[32m10-25 22:54:54[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(434176,)
[32m10-25 22:54:54[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。漠然とした不安や焦り、ですか。ええ、ありがとうございます。まずはその『漠然とした不安』について、もう少し聞かせてもらえますか？
[32m10-25 22:54:54[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(496640,)
[32m10-25 22:54:54[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。なんだか、周りはちゃんとやっているのに自分だけ置いていかれている気がして…。仕事でも、これでいいのかなって考えて、休日もリラックスできないんです。
[32m10-25 22:54:54

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:28[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_35...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_35[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m149.311[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:33[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_35...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_35[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m142.794[0m seconds                                 


[32m10-25 23:00:25[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
今日で、もう〇回目のセッションになりますね。これまでのことを少し振り返ってみて、今どんなお気持ちですか？ ええ、このカウンセリングが始まってから、何か変化を感じることはありましたか？
[32m10-25 23:00:25[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(583680,)
[32m10-25 23:00:25[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですね、最初は本当に漠然とした不安ばかりで、どうしたらいいかわからなかったんですけど…。ええ。でも、先生とお話しするうちに、少しずつ自分の気持ちを整理できるようになってきた気がします。
[32m10-25 23:00:25[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(626176,)
[32m10-25 23:00:25[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。はい。漠然とした不安だったところが、整理できるようになってきた、ということですね。具体的に、どんなところでそう感じますか？
[32m10-25 23:00:26[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(476160,)
[32m10-25 23:00:26[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
例えば、以前は仕事で少しでもうまくいかない

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:33[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_36...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_36[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m144.288[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:36[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_36...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_36[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m146.409[0m seconds                                 


[32m10-25 23:05:56[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Bさん。今日はなんだか、少しお疲れのようにも見えますが、いかがですか？
[32m10-25 23:05:57[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(261120,)
[32m10-25 23:05:57[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、A先生。はい、少し、考えすぎてしまっているかもしれません。最近、仕事でもプライベートでも、「こうあるべきだ」って、自分を追い詰めてしまっているような気がして。
[32m10-25 23:05:57[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(544768,)
[32m10-25 23:05:57[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。「こうあるべきだ」というお気持ちが強くなっているんですね。ええ、とてもよくわかります。どんな時に、特にそう感じることが多いですか？
[32m10-25 23:05:57[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(479232,)
[32m10-25 23:05:57[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
例えば、週末も「何か生産的なことをすべきだ」と思って、ずっと落ち着かないんです。ゆっくり休んでいても、「これでいいのかな」って不安になるというか…。


[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:30[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_37...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_37[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m153.005[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:29[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_37...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_37[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m139.843[0m seconds                                 


[32m10-25 23:11:26[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、今日は来てくださってありがとうございます。何か、お話ししたいことがあって、お越しくださったんですよね。
[32m10-25 23:11:27[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(335360,)
[32m10-25 23:11:27[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、あの、最近、なんだか漠然とした不安というか、焦りを感じることが多くて。仕事もそうですし、プライベートでも、ずっとモヤモヤしているんです。
[32m10-25 23:11:27[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(500224,)
[32m10-25 23:11:27[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね、漠然とした不安や焦り、ということでしたか。ええ、お話ししてくださり、ありがとうございます。どんな時に、特にそう感じられますか？
[32m10-25 23:11:27[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(507392,)
[32m10-25 23:11:27[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
仕事では、周りの期待に応えられていないんじゃないかとか、このままでいいのか、って考えてしまって。何か新しいことを始めたい気持ちもあるんですけど、結局、何

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:35[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_38...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_38[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m147.929[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:28[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_38...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_38[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m138.085[0m seconds                                 


[32m10-25 23:16:49[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、今日はよくお越しくださいました。どうぞ楽になさってくださいね。何か、最近気になることなど、お話しできる範囲で構いませんので、聞かせていただけますか？
[32m10-25 23:16:49[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(539648,)
[32m10-25 23:16:49[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、ありがとうございます。あの、最近、なんだか漠然とした不安というか、焦りを感じることが多くて…何から話せばいいのかも、正直よくわからなくて…
[32m10-25 23:16:49[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(564736,)
[32m10-25 23:16:49[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。漠然とした不安や焦り、感じていらっしゃるのですね。ええ、大丈夫ですよ、何からでも構いません。思いつくままに、お話しいただけますか？
[32m10-25 23:16:49[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(538624,)
[32m10-25 23:16:49[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい…。仕事では、今までと同じようにやっているつもりなんですけど、なんだか効率が落ちたような気がして。

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:33[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_39...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_39[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m144.002[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:32[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_39...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_39[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m140.753[0m seconds                                 


[32m10-25 23:22:46[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Bさん。今日はありがとうございます。何か、お話ししたいことはありますか？
[32m10-25 23:22:47[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(261120,)
[32m10-25 23:22:47[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、こんにちは。最近、仕事で頭がいっぱいで、なんだか落ち着かないんです。
[32m10-25 23:22:47[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(265216,)
[32m10-25 23:22:47[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。仕事が離れないと。ええ、どんな時にそう感じますか？
[32m10-25 23:22:47[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(258560,)
[32m10-25 23:22:47[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
仕事中はもちろん、家でも休日でも。常に何かに追われている感覚です。
[32m10-25 23:22:47[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(250880,)
[32m10-25 23:22:47[0m

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:14[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_40...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_40[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m137.016[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:14[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_40...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_40[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m130.303[0m seconds                                 


[32m10-25 23:27:51[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
Bさん、こんにちは。今日は前回お話しした「漠然とした焦り」について、もう少し詳しくお伺いしてもいいですか？
[32m10-25 23:27:51[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(366592,)
[32m10-25 23:27:51[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、もちろんです。最近、本当に何もかもが「もっとこうすべきだ」と感じてしまって…。
[32m10-25 23:27:52[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(272384,)
[32m10-25 23:27:52[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうなんですね。「もっとこうすべきだ」と感じる…、具体的にはどのような時にそう思われますか？
[32m10-25 23:27:52[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(349696,)
[32m10-25 23:27:52[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
例えば、仕事では「もっと早く終わらせるべき」とか、家では「もっと部屋をきれいにするべき」とか…。もう、常に何かを「すべき」って考えている気がします。
[32m10-25 23:27:52[0m |[1m  INFO  [0m| tts_model.py:

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:32[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_41...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_41[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m153.795[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:31[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_41...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_41[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m142.139[0m seconds                                 


[32m10-25 23:33:28[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、今日はどうぞよろしくお願いします。今日は、まずお話しをゆっくり聞かせていただきたいなと思っています。何か、今お困りのことや、話してみたいことなどありますか？
[32m10-25 23:33:28[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(554496,)
[32m10-25 23:33:28[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、よろしくお願いします。実は最近、仕事のこともそうですし、なんだか日常生活全般で、こう漠然とした不安というか、焦りみたいなものを感じていて...。何から話したらいいか、という感じなんですけど。
[32m10-25 23:33:28[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(687104,)
[32m10-25 23:33:28[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。ええ、漠然とした不安や焦りを感じていらっしゃるんですね。ありがとうございます、話してくださって。そういった漠然とした、というお気持ちは、どんな時に特に感じられますか？何かきっかけのようなものがあったりしますか？
[32m10-25 23:33:29[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(775680,)
[32m10-25 23:33:29[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating au

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:34[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_42...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_42[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m145.030[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:40[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_42...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_42[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m149.704[0m seconds                                 


[32m10-25 23:38:56[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、これまで数回にわたってセッションを重ねてきましたが、いよいよ終結に向けての準備の時期に入ってきましたね。これまでの時間を少し振り返ってみて、今どんなお気持ちですか？
[32m10-25 23:38:56[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(578560,)
[32m10-25 23:38:56[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですね…なんだかあっという間だったような気がします。最初は本当に漠然とした不安ばかりで、どうしていいか分からなかったんですけど…
[32m10-25 23:38:57[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(425472,)
[32m10-25 23:38:57[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、そうでしたね。漠然とした不安を感じていらっしゃいましたもんね。ええ。その中で、〇〇さんご自身で、何か変化を感じることはありましたか？
[32m10-25 23:38:57[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(456192,)
[32m10-25 23:38:57[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。例えば、以前は職場の人間関係で何かあると、ずっと引きずってしまって、何も手につかなかったんですが、最

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:34[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_43...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_43[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m143.268[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:31[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_43...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_43[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m141.741[0m seconds                                 


[32m10-25 23:44:45[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。今日はどうされましたか？ゆっくりお話聞かせてもらえますか？
[32m10-25 23:44:46[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(239616,)
[32m10-25 23:44:46[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは…最近、仕事で漠然とした不安がずっとあって、落ち着かないんです。
[32m10-25 23:44:46[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(252416,)
[32m10-25 23:44:46[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。お仕事の不安、ええ。どんな時にそう感じますか？
[32m10-25 23:44:46[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(227840,)
[32m10-25 23:44:46[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。朝から憂鬱で、仕事も集中できず、夜も寝付けなくて…。
[32m10-25 23:44:46[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(252416,)
[32m10-25 23:44:46[0m |[1m  INFO 

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:13[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_44...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_44[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m123.901[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:11[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_44...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_44[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m121.095[0m seconds                                 


[32m10-25 23:49:27[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、今日はこれまでのセッションを少し振り返ってみませんか。このカウンセリングを通じて、どんな変化や気づきがありましたか？
[32m10-25 23:49:27[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(386560,)
[32m10-25 23:49:27[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですね…。最初は漠然とした不安ばかりで、何から話せばいいのかもわからなかったんですけれど…本当に、もやもやしていました。
[32m10-25 23:49:27[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(435712,)
[32m10-25 23:49:27[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、はい、そうでしたね。最初は本当に、心の中に霧がかかっているようでしたよね。
[32m10-25 23:49:27[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(302080,)
[32m10-25 23:49:27[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。でも、先生とお話しするうちに、自分が何に不安を感じているのか、少しずつ具体的に見えてきた気がします。
[32m10-25 23:49:28[0m |[1m  INFO  [0m| tts_model.py:

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:28[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_45...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_45[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m137.849[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:31[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_45...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_45[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m144.905[0m seconds                                 


[32m10-25 23:54:58[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、〇〇さん。今日は来てくださってありがとうございます。少しでも楽にお話しできるよう、お手伝いさせていただきますね。
[32m10-25 23:54:58[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(380928,)
[32m10-25 23:54:58[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、よろしくお願いします。なんだか最近、ずっと気持ちが落ち着かなくて…漠然とした不安と焦りがあって、どうしたらいいのか分からなくて。
[32m10-25 23:54:59[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(459264,)
[32m10-25 23:54:59[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですか…ずっと気持ちが落ち着かなくて、漠然とした不安と焦りを感じていらっしゃるんですね。ええ、そういったお気持ち、とてもよく分かります。具体的に、どんな時にそう感じることが多いですか？
[32m10-25 23:54:59[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(671744,)
[32m10-25 23:54:59[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
仕事中は、常に締め切りに追われているような感覚で、ミスがないか、もっとできるんじゃないかって。家に帰って

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:02:47[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_46...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_46[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m280.174[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:46[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_46...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_46[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m158.689[0m seconds                                 


[32m10-26 00:02:50[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、今日は来てくださってありがとうございます。少しお話を伺ってもよろしいでしょうか？
[32m10-26 00:02:50[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(285184,)
[32m10-26 00:02:50[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、よろしくお願いします。最近、なんだか漠然とした不安というか、焦りを感じることが多くて…
[32m10-26 00:02:51[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(336384,)
[32m10-26 00:02:51[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。漠然とした不安や焦りを感じていらっしゃるんですね。ええ、もう少し詳しくお聞かせいただけますか？
[32m10-26 00:02:51[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(365056,)
[32m10-26 00:02:51[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。仕事でもプライベートでも、何をしていても「これでいいのかな」とか、「もっと何かできるはずなのに」と思ってしまって。具体的に何が原因なのか、自分でもよくわからないんです。
[32m10-26 00:02:51[0m |[1m  INFO  [0

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:33[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_47...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_47[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m148.219[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:32[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:06[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_47...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_47[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m147.715[0m seconds                                 


[32m10-26 00:08:28[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。今日はどんなことについてお話ししましょうか。何か気になっていることはありますか？
[32m10-26 00:08:28[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(286720,)
[32m10-26 00:08:28[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、なんだか最近、常に『〜しなきゃいけない』って考えてしまうんです。仕事でもプライベートでも、もっと頑張らなきゃって…
[32m10-26 00:08:29[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(384000,)
[32m10-26 00:08:29[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。常に『〜しなきゃいけない』と感じていらっしゃるんですね。ええ。もう少し詳しくお聞かせいただけますか？
[32m10-26 00:08:29[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(357376,)
[32m10-26 00:08:29[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
例えば、仕事では「この資料は完璧に作らないとダメだ」とか、休みの日でも「何か生産的なことをしないと時間がもったいない」とか…
[32m10-26 00:08:29[0m |[1m  INFO  [0m| tts_m

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:24[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_48...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_48[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m133.620[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:23[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_48...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_48[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m132.113[0m seconds                                 


[32m10-26 00:13:52[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。今日はよくいらっしゃいましたね。今、どんなことを感じていますか？ゆっくりお聞かせください。
[32m10-26 00:13:52[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(330752,)
[32m10-26 00:13:52[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、最近どうも気分が晴れなくて。仕事もプライベートも、漠然とした焦りを感じています。
[32m10-26 00:13:52[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(320512,)
[32m10-26 00:13:52[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
漠然とした焦り、ということでしたか。ええ、それはお辛いですね。どんな時にそう感じますか？
[32m10-26 00:13:53[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(334336,)
[32m10-26 00:13:53[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
仕事中は、『もっとできるはずなのに』とか、『周りに遅れているんじゃないか』って。家に帰ってもモヤモヤが続いてしまって。
[32m10-26 00:13:53[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data gene

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:21[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_49...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_49[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m130.572[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:17[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_49...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_49[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m126.725[0m seconds                                 


[32m10-26 00:18:53[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、こんにちは。今日はこれまでのセッションを振り返りながら、そろそろ終結に向けてのお話を少しずつしていきましょうか。いかがですか？
[32m10-26 00:18:53[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(444928,)
[32m10-26 00:18:53[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、お願いします。早いもので、もうそんな時期なんですね。あっという間でした。
[32m10-26 00:18:53[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(294912,)
[32m10-26 00:18:53[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうですね。初めてこちらにいらした頃と比べて、ご自身の中でどんな変化を感じていらっしゃいますか？
[32m10-26 00:18:54[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(360960,)
[32m10-26 00:18:54[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
うーん…以前は漠然とした不安ばかりで、仕事でも私生活でもどうしたらいいか分からなかったんですけど、今は少しずつ自分の気持ちを整理できるようになってきた気がします。
[32m10-26 00:18:54[0m |[1m  INFO

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:30[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_50...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_50[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m145.724[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:34[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_50...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_50[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m142.861[0m seconds                                 


[32m10-26 00:24:52[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、こんにちは。今日はどんなことをお話ししたいですか？
[32m10-26 00:24:52[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(191488,)
[32m10-26 00:24:52[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。漠然とした不安や焦りがあって…少し話せたらと。
[32m10-26 00:24:52[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(235008,)
[32m10-26 00:24:52[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
不安や焦り、ですか。そうなんですね。どんな時に感じることが多いですか？
[32m10-26 00:24:53[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(251392,)
[32m10-26 00:24:53[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
仕事中も、家でも落ち着かなくて…何をしてても「これでいいのかな」って。
[32m10-26 00:24:53[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(279552,)
[32m10-26 00:24:53[0m |[1m  INFO  [

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:17[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_51...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_51[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m127.612[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:12[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_51...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_51[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m125.968[0m seconds                                 


[32m10-26 00:30:18[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、こんにちは。今日はセッションを振り返り、今後についてお話しできればと思います。
[32m10-26 00:30:18[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(289792,)
[32m10-26 00:30:18[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。もうそんな時期なんですね。少し寂しい気持ちです。
[32m10-26 00:30:18[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(211456,)
[32m10-26 00:30:18[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ。この期間、ご自身でどんな変化を感じていますか？
[32m10-26 00:30:18[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(183808,)
[32m10-26 00:30:18[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
漠然とした不安が、何に困っているか整理できた気がします。
[32m10-26 00:30:19[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(198144,)
[32m10-26 00:30:19[0m |[1m  INFO  [0m| 

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:12[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_52...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_52[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m124.057[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:09[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_52...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_52[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m119.016[0m seconds                                 


[32m10-26 00:35:39[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、前回のカウンセリングから、『〜すべき』と感じたことについて、何かお話しいただけますか？




[32m10-26 00:35:39[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(292864,)
[32m10-26 00:35:39[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。『もっと頑張るべき』とか、『完璧にすべき』って、いつも頭にあります。そうしないと、周りに迷惑をかけるんじゃないかと…。
[32m10-26 00:35:40[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(399872,)
[32m10-26 00:35:40[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
なるほど、『もっと頑張るべき』『完璧にすべき』という考えが、常に〇〇さんの中にあるんですね。どんな時に特に強く感じますか？
[32m10-26 00:35:40[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(404992,)
[32m10-26 00:35:40[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
仕事や家事で、きちんとやらなきゃと思う時です。少しでも手が抜けたら、もうダメだと落ち込んでしまって…。
[32m10-26 00:35:40[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(337408,)
[32m10-26 00:35:40[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from t



[32m10-26 00:35:42[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(560640,)
[32m10-26 00:35:42[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、大丈夫です。よろしくお願いします。
[32m10-26 00:35:43[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(145408,)
[32m10-26 00:35:43[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、こちらこそ。今日はありがとうございました。
[32m10-26 00:35:43[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(152576,)


[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:21[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_53...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_53[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m132.208[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:22[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_53...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_53[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m135.078[0m seconds                                 


[32m10-26 00:40:59[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、〇〇さん。今日は来てくださってありがとうございます。どうぞ楽にしてくださいね。ええ、今日はどんなお話しをされますか？
[32m10-26 00:40:59[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(391168,)
[32m10-26 00:40:59[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。はい、ありがとうございます。えっと…最近、なんだか漠然と不安な気持ちが続いていて、どうしたらいいのか分からなくて…。
[32m10-26 00:40:59[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(475648,)
[32m10-26 00:40:59[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
漠然とした不安、なのですね。ええ、ありがとうございます。そうしたお気持ちが続いていらっしゃるのですね。もう少し詳しく、どんな時にそう感じることが多いですか？
[32m10-26 00:41:00[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(555008,)
[32m10-26 00:41:00[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですね…特に仕事中はそう感じることが多いです。いつも時間に追われている感じで、焦ってしまうというか。家でも、次に何をすればいいか分から

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:37[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_54...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_54[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m145.413[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:39[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_54...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_54[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m149.257[0m seconds                                 


[32m10-26 00:46:29[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、こんにちは。今日はこれまでのカウンセリングを振り返りながら、そろそろ終結に向けてのお話ができればと思うのですが、いかがでしょうか？
[32m10-26 00:46:29[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(451584,)
[32m10-26 00:46:29[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、承知しています。なんだかあっという間だったような、でもすごく濃い時間だったような気がします。
[32m10-26 00:46:30[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(317440,)
[32m10-26 00:46:30[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。この数ヶ月、〇〇さんが感じていらっしゃった漠然とした不安や焦りについて、色々な角度からお話ししてきましたね。ええ、ご自身の中で、何か変化を感じることはありますか？
[32m10-26 00:46:30[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(621568,)
[32m10-26 00:46:30[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。以前は、職場の人間関係とか、将来のこととか、考え出すと悪い方にばかり考えてしまって、どうしようもなかったんですけど…今は、少し冷



[32m10-26 00:46:30[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(423424,)
[32m10-26 00:46:30[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
例えば、職場でちょっとしたミスがあった時、前なら「また失敗した、自分はダメだ」ってすぐ落ち込んでたんですけど、今は「これは失敗だけど、次にどう活かそうか」って、感情的になる前に考えられるようになったというか。
[32m10-26 00:46:31[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(659456,)
[32m10-26 00:46:31[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
なるほど。以前の「ダメだ」という認知から、改善策を考える行動に繋がっているのですね。はい、それは素晴らしいことですね。ご自身でも、そうした変化を実感されているのですね。
[32m10-26 00:46:31[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(615424,)
[32m10-26 00:46:31[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。完全に不安がなくなったわけじゃないですけど、前よりずっと楽になりました。ただ、この状態を自分で維持できるか、少し心配な気持ちもあります。
[32m10-26 00:46:31[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(478208,)
[32m10-26 

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:33[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_55...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_55[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m156.248[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:36[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_55...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_55[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m158.674[0m seconds                                 


[32m10-26 00:53:06[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
Bさん、こんにちは。今日はどうですか、お変わりありませんでしたか？
[32m10-26 00:53:07[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(223744,)
[32m10-26 00:53:07[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。毎日『〜すべき』って焦ってしまって。仕事も家事も、できてない自分にモヤモヤします。
[32m10-26 00:53:07[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(323584,)
[32m10-26 00:53:07[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
『〜すべき』というお気持ちで焦ってしまうのですね。はい、ええ。




[32m10-26 00:53:07[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(186880,)
[32m10-26 00:53:07[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ。休日に家事を完璧に『すべき』なのに、疲れてしまって。そうすると、ダメだって。
[32m10-26 00:53:07[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(271360,)
[32m10-26 00:53:07[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
できないとご自身を責めてしまうのですね。なるほど。
[32m10-26 00:53:08[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(174080,)
[32m10-26 00:53:08[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
友達との会話でも、『もっと面白く言うべき』って思って、結局何も話せないこともあって…。
[32m10-26 00:53:08[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(293376,)
[32m10-26 00:53:08[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
人間関係でも『〜すべき』が浮かぶのですね。その時、どんなお気持ちに？
[32m10-26 00:53:08[0m |

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:16[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_56...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_56[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m133.753[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:15[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_56...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_56[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m134.713[0m seconds                                 


[32m10-26 00:58:48[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Bさん。今日はよくお越しくださいました。どのようなお悩みですか？
[32m10-26 00:58:49[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(242688,)
[32m10-26 00:58:49[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。最近、仕事もそうですが、漠然と将来が不安で、焦りを感じていて...。
[32m10-26 00:58:49[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(293376,)
[32m10-26 00:58:49[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。漠然とした不安や焦り、お辛いですね。もう少しお聞かせください。
[32m10-26 00:58:49[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(298496,)
[32m10-26 00:58:49[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。仕事は忙しいのに、これでいいのかって。日々に追われ、やりたいことができてない気がして。
[32m10-26 00:58:49[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(327680,)
[32m

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:19[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:06[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_57...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_57[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m140.828[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:18[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_57...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_57[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m129.709[0m seconds                                 


[32m10-26 01:03:58[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、今日はよくお越しくださいましたね。何か、今お話ししたいことなどありますでしょうか？
[32m10-26 01:03:58[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(293888,)
[32m10-26 01:03:58[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、こんにちは。最近、なんだか漠然とした不安というか、焦りのようなものを感じていて…。仕事のこともそうなんですけど、日常生活でも気分が晴れないことが多くて。
[32m10-26 01:03:59[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(580096,)
[32m10-26 01:03:59[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。ええ、漠然とした不安や焦り、ですか。それは、いつ頃から感じられるようになりましたか？
[32m10-26 01:03:59[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(369152,)
[32m10-26 01:03:59[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
いつから、と言われると難しいんですけど、半年くらい前からでしょうか。仕事で新しいプロジェクトが始まって、そのプレッシャーもあってか、なんとなくずっと落ち着かないんです。
[32m10-26 01:

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:33[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_58...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_58[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m153.190[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:34[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_58...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_58[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m144.871[0m seconds                                 


[32m10-26 01:09:36[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、こんにちは。今日は前回お話しいただいた、漠然とした不安について、もう少しお聞かせいただけますか？
[32m10-26 01:09:36[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(372224,)
[32m10-26 01:09:36[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、なんだか常に「もっとこうするべきだ」とか「これは完璧にやるべきだ」という気持ちに追い立てられているような気がして…それがしんどいんです。特に仕事でも、もっとできるはずなのに、と自分を責めてしまいます。
[32m10-26 01:09:36[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(667136,)
[32m10-26 01:09:36[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。常に「もっとこうするべきだ」「完璧にやるべきだ」と感じていらっしゃるんですね。ええ、お辛いお気持ち、よくわかりますよ。例えば、どんな時にそう強く感じられるのでしょう？
[32m10-26 01:09:37[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(620032,)
[32m10-26 01:09:37[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
最近だと、資料作成で時間をかけすぎた時ですかね。もっと早



[32m10-26 01:09:37[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(763392,)
[32m10-26 01:09:37[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
だって、周りの人はみんなもっとテキパキやってるように見えるし、自分だけが要領悪いんじゃないかって。完璧にできないと、評価もされないんじゃないかと思ってしまうんです。
[32m10-26 01:09:37[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(472576,)
[32m10-26 01:09:37[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうなんですね。周りの方と比べてしまって、「完璧にできないと評価されない」と感じてしまうのですね。そういった考えが浮かんでくると、〇〇さんはどんなお気持ちになりますか？
[32m10-26 01:09:38[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(574976,)
[32m10-26 01:09:38[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
すごく焦りますし、不安になります。結果的に、余計に時間がかかってしまったり、手が進まなくなったりして、もっと落ち込むんです。
[32m10-26 01:09:38[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(377856,)
[32m10-26 01:09:38[0m |[1m  INFO  [



[32m10-26 01:09:39[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(1121792,)
[32m10-26 01:09:39[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、試してみます。なんだか、少し頭が整理された気がします。
[32m10-26 01:09:39[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(224256,)
[32m10-26 01:09:39[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
それは良かったです。はい。では、今日はここまでとしましょうか。次回の予約ですが、いつ頃がよろしいでしょうか？
[32m10-26 01:09:39[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(366080,)


[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:38[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_59...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_59[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m146.988[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h[?25l

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:39[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:06[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_59...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_59[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m150.324[0m seconds                                 


[32m10-26 01:15:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、〇〇さん。今日は来てくださってありがとうございます。少しお話しにくいこともあるかもしれませんが、ゆっくりお聞かせくださいね。
[32m10-26 01:15:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(408064,)
[32m10-26 01:15:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、ありがとうございます。なんだか最近、ずっと気持ちが落ち着かなくて…漠然とした不安があるんです。
[32m10-26 01:15:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(359424,)
[32m10-26 01:15:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。漠然とした不安を感じていらっしゃるんですね。ええ、具体的に、どんな時にそう感じることが多いですか？
[32m10-26 01:15:23[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(391680,)
[32m10-26 01:15:23[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
えっと、仕事をしている時もそうなんですが、休日でも、なんだか常に焦っているような…このままでいいのかな、って考えてしまって。
[32m10-26 01:15:23[0m |[1m  INFO  

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:26[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_60...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_60[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m137.521[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:24[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_60...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_60[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m141.911[0m seconds                                 


[32m10-26 01:20:49[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、〇〇さん。最近の調子はいかがですか？何か気になっていることはありますか？
[32m10-26 01:20:49[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(260096,)
[32m10-26 01:20:49[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、先生。最近も仕事で「もっとこうすべきだ」とか、「完璧にこなさなきゃ」という気持ちが強くて、それが結構しんどいなと感じています。
[32m10-26 01:20:50[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(443904,)
[32m10-26 01:20:50[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうなんですね。「こうすべきだ」「完璧にこなさなきゃ」と、しんどい気持ちなのですね。
[32m10-26 01:20:50[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(328704,)
[32m10-26 01:20:50[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。常にそう思っていないと、ダメなんじゃないかって。少しでも手を抜くと、いけないような気がして…。
[32m10-26 01:20:50[0m |[1m  INFO  [0m| tts_model.py:324 | Audio 

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:24[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_61...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_61[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m133.900[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:24[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_61...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_61[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m138.637[0m seconds                                 


[32m10-26 01:26:38[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、今日はこれまでのセッションを振り返り、終結に向けてお話しできたらと思います。
[32m10-26 01:26:38[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(271872,)
[32m10-26 01:26:38[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。あっという間でしたが、色々変わったなと思います。
[32m10-26 01:26:38[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(176640,)
[32m10-26 01:26:39[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうですか。ご自身の中で、どんな変化を感じていますか？
[32m10-26 01:26:39[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(223232,)
[32m10-26 01:26:39[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
漠然とした不安ばかりでしたが、今は自分の感情に気づけるのが大きいですね。前は焦っていたのが、『これが原因でこう感じる』と冷静に分析できるようになりました。
[32m10-26 01:26:39[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:15[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:06[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_62...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_62[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m124.614[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:12[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_62...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_62[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m125.888[0m seconds                                 


[32m10-26 01:31:44[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。Bさん、今日はどんなことをお話ししたいですか？ご様子はいかがでしたか？
[32m10-26 01:31:44[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(279040,)
[32m10-26 01:31:44[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。最近、「もっとちゃんとしなきゃ」という気持ちが強くて、落ち着かないんです。
[32m10-26 01:31:44[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(240640,)
[32m10-26 01:31:44[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
「もっとちゃんとしなきゃ」、ですか。ええ、そうなんですね。どんな時にそう感じますか？




[32m10-26 01:31:45[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(281600,)
[32m10-26 01:31:45[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
仕事で完璧に、家事も友だちへの連絡もすぐに返さなきゃ、って。
[32m10-26 01:31:45[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(205824,)
[32m10-26 01:31:45[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
なるほど。その「〜しなきゃ」が強い時、どんな気持ちになりますか？




[32m10-26 01:31:45[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(219648,)
[32m10-26 01:31:45[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
焦って、もしできなかったらどう思われるか不安になります。
[32m10-26 01:31:45[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(172032,)
[32m10-26 01:31:45[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
焦りや不安ですね。もし、完璧じゃなくても、すぐに返せなくても、何か困ることはありますか？
[32m10-26 01:31:46[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(335872,)
[32m10-26 01:31:46[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
うーん…実際は困らないのかも。でも、そうしなきゃって思ってしまうんです。
[32m10-26 01:31:46[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(248832,)
[32m10-26 01:31:46[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、そうですよね。では、もし「完璧じゃなくても大丈夫」と少し自分に許可を出せたら、どう感じそうですか？
[32m10-



[32m10-26 01:31:47[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(503296,)
[32m10-26 01:31:47[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
自分を追い詰めているのは、「〜しなきゃ」という気持ちだと気づけました。




[32m10-26 01:31:47[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(191488,)
[32m10-26 01:31:47[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。次回までに、もし「〜すべき」と感じた時に、ご自身の気持ちを少し観察してみていただけますか？




[32m10-26 01:31:47[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(375296,)
[32m10-26 01:31:47[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、やってみます。
[32m10-26 01:31:48[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(79872,)
[32m10-26 01:31:48[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ありがとうございます。では、今日はここまでとしましょう。次回の予約を決めましょうね。
[32m10-26 01:31:48[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(279040,)


[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:20[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_63...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_63[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m132.842[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:14[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_63...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_63[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m126.547[0m seconds                                 


[32m10-26 01:36:39[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、〇〇さん。今日は、どんなことでお越しになりましたか？お話しいただける範囲で、ゆっくりお聞かせくださいね。
[32m10-26 01:36:39[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(352768,)
[32m10-26 01:36:39[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、なんだか最近、うまくいかないことばかりで…仕事もそうなんですけど、なんだか毎日が不安で、焦る気持ちがあります。
[32m10-26 01:36:40[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(440832,)
[32m10-26 01:36:40[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。なんだか、お辛い気持ちを抱えていらっしゃるのですね。仕事のことも、そして、毎日の生活のことも、漠然とした不安や焦りを感じていらっしゃる、と。
[32m10-26 01:36:40[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(572416,)
[32m10-26 01:36:40[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうです。朝起きるのも億劫で、仕事に行っても集中できなくて。家に帰っても、何をする気にもなれないんです。
[32m10-26 01:36:40[0m |



[32m10-26 01:36:41[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(104448,)
[32m10-26 01:36:41[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
色々と、お心の中で抱えられていることが、たくさんあるようですね。今日は、そんな漠然とした不安や焦りの背景について、少しだけお話を聞かせていただけて、ありがとうございます。
[32m10-26 01:36:42[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(610816,)
[32m10-26 01:36:42[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こちらこそ、ありがとうございます。少し、話せてよかったです。
[32m10-26 01:36:42[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(181248,)
[32m10-26 01:36:42[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、嬉しいです。今日は、〇〇さんが今抱えていらっしゃる気持ちの大きな部分に触れることができたと思います。次回は、もう少し具体的に、この不安や焦りがどんな時に強く感じられるかなど、お話しできたらと思いますがいかがでしょうか？
[32m10-26 01:36:42[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(755200,)
[32m10-26 01:36:42[0m |[1m  INFO  [0m| 

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:23[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_64...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_64[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m137.458[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:26[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_64...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_64[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m138.311[0m seconds                                 


[32m10-26 01:41:54[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、こんにちは。今日は、これまで数回お話を伺ってきましたが、そろそろ終結に向けて、これまでを少し振り返ってみませんか？
[32m10-26 01:41:55[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(449536,)
[32m10-26 01:41:55[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですね…。なんか、あっという間でしたけど、色々ありましたね。
[32m10-26 01:41:55[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(212480,)
[32m10-26 01:41:55[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうですね。このカウンセリングを通じて、〇〇さんの中で、何か変化したことや、気づかれたことなど、ございますか？
[32m10-26 01:41:55[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(390144,)
[32m10-26 01:41:55[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。以前は漠然と「性格を直したい」って思ってたんですけど、集団面接で話せない自分にどうしていいか分からなくて。でも、先生と話していくうちに、あ、自分が「口下手だから失敗する」って勝手に思い込んで、それで不安になって、結局話さない、っていう悪循環

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:34[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_65...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_65[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m147.785[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:27[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_65...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_65[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m138.131[0m seconds                                 


[32m10-26 01:47:57[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。今日はよくいらっしゃいましたね。お座りください。今日のご気分はいかがですか？
[32m10-26 01:47:58[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(286208,)
[32m10-26 01:47:58[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。ありがとうございます。なんだかいつも心がざわざわしていて、落ち着かない感じです...。
[32m10-26 01:47:58[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(335872,)
[32m10-26 01:47:58[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね、心がざわざわされるのですね。まずは今感じていること、どんなことでもお話しくださいね。
[32m10-26 01:47:58[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(377344,)
[32m10-26 01:47:58[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい...。仕事のことが一番大きいと思うんですけど、最近は家でも休日でも、常に何かに追われているような焦りを感じていて。漠然とした不安があります。
[32m10-26 01:47:58[0m |[1m  INFO  [0m| tts_model.py:32

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:25[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_66...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_66[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m139.021[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:20[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_66...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_66[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m133.339[0m seconds                                 


[32m10-26 01:53:07[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、〇〇さん。今日は、〇〇さんのお話をゆっくりお聞かせいただきたいと思います。何か、今、お話ししたいことや、気になっていることがあれば、どんなことでも構いませんので、教えていただけますか？
[32m10-26 01:53:08[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(608256,)
[32m10-26 01:53:08[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。ええと、最近、仕事のこともそうなんですけど、なんかこう、日常生活全体に漠然とした不安とか、焦りを感じていて…。何から話したらいいのか、自分でもよくわからなくて。
[32m10-26 01:53:08[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(612864,)
[32m10-26 01:53:08[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。はい、漠然とした不安や焦り、感じていらっしゃるのですね。ええ、何から話したらいいか分からない、というお気持ち、とてもよく分かりますよ。もしよかったら、どんな時に、その不安や焦りを感じることが多いですか？ 例えば、具体的な場面などあれば…。
[32m10-26 01:53:08[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(907264,)
[32m10-26 01:53:08[0m |[1m  INFO  [0m| tts_model.py:259 | Start 

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:45[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_67...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_67[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m156.684[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:43[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_67...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_67[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m156.946[0m seconds                                 


[32m10-26 01:58:56[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、今日は来てくださってありがとうございます。どうぞ楽にしてくださいね。今日はどんなことをお話ししたい気分ですか？
[32m10-26 01:58:56[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(395776,)
[32m10-26 01:58:56[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。ありがとうございます。えっと…何から話せばいいのか、漠然とした不安があるというか、焦りを感じていて…
[32m10-26 01:58:56[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(420864,)
[32m10-26 01:58:56[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。漠然とした不安と焦り、ですか。はい。具体的にどんな時に、そういった気持ちになることが多いですか？
[32m10-26 01:58:57[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(418304,)
[32m10-26 01:58:57[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
特にこれといったきっかけがあるわけじゃないんですけど、仕事でうまくいかないと「このままでいいのかな」って考えたり、家で一人でいる時も、ふと「何やってるんだろう」って思っちゃったりして…
[32m10-2

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:30[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_68...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_68[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m141.880[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:26[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_68...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_68[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m139.917[0m seconds                                 


[32m10-26 02:04:11[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、今日はこれまでのセッションを振り返りながら、そろそろ終結に向けてのお話をしていきましょうか。いかがですか？
[32m10-26 02:04:12[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(360448,)
[32m10-26 02:04:12[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、なんだかあっという間でしたね。でも、少し寂しいような気もします。
[32m10-26 02:04:12[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(243712,)
[32m10-26 02:04:12[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうなんですね。寂しさを感じていらっしゃるのですね。この数ヶ月、〇〇さんがご自身と向き合ってこられた中で、何か特に印象に残っていることや、ご自身の変化について感じることはありますか？
[32m10-26 02:04:12[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(660992,)
[32m10-26 02:04:12[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですね…。最初は本当に漠然とした不安ばかりで、どうしたらいいか分からなかったんですけど、カウンセラーさんとお話ししていく中で、何が自分を不安にさせているのか、少しずつ整理で

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:27[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_69...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_69[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m138.039[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:26[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_69...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_69[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m138.975[0m seconds                                 


[32m10-26 02:09:59[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Bさん。今日は前回お話しいただいた「もっと完璧にしないと」というお気持ちについて、もう少し詳しくお伺いしてもよろしいでしょうか？
[32m10-26 02:09:59[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(442368,)
[32m10-26 02:09:59[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、先生。最近、特に仕事で「もっと完璧にしないと」っていつも考えてしまって、何をしていても焦りを感じるんです。
[32m10-26 02:10:00[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(391168,)
[32m10-26 02:10:00[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。その「完璧に」というお気持ち、どんな時に特に強く感じられますか？
[32m10-26 02:10:00[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(294912,)
[32m10-26 02:10:00[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
例えば資料作成だと、小さな誤字脱字一つでも許せなくて。「これじゃダメだ」って自分を追い詰めてしまって、提出も遅れて困っています。
[32m10-26 02:10:00[0m |[1m  INFO  [0m| t

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:27[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_70...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_70[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m138.268[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:26[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_70...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_70[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m139.105[0m seconds                                 


[32m10-26 02:15:25[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Bさん。今日はどんなお気持ちですか？
[32m10-26 02:15:25[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(145408,)
[32m10-26 02:15:25[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、先生。なんだかモヤモヤしていて。やらなきゃいけないことがたくさんあるのに、全然できてないなって焦りを感じるんです。
[32m10-26 02:15:25[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(398336,)
[32m10-26 02:15:25[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。やらなきゃいけないことが多くて、ええ。それができていないと感じて焦りを感じていらっしゃる、と。
[32m10-26 02:15:25[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(348672,)
[32m10-26 02:15:26[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。例えば、もっと早く仕事を終わらせて、家に帰ったらすぐご飯を作って、部屋もいつも綺麗にしておくべきだって思うのに、実際は追いつかなくて…。
[32m10-26 02:15:26[0m |[1m  INFO  [0m| tts_model.py:324 |



[32m10-26 02:15:26[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(268800,)
[32m10-26 02:15:26[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。周りの人はみんなもっとちゃんとやっている気がして、自分だけができてないって思うと、余計に苦しくなってしまいます。
[32m10-26 02:15:26[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(392192,)
[32m10-26 02:15:26[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうなんですね。周りと比べて、ご自身に「〜すべき」という期待をかけていらっしゃる。その考え方は、Bさんにとって、どんなふうに感じられますか？




[32m10-26 02:15:27[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(492544,)
[32m10-26 02:15:27[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
うーん…楽には、ならないですね。むしろ、もっと追い詰められるような、息苦しい感じがします。
[32m10-26 02:15:27[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(334848,)
[32m10-26 02:15:27[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、息苦しく感じるんですね。無理に「〜すべき」とご自身を縛りつけると、心は休まらないかもしれませんね。今日は、その「〜すべき」が心にどう影響しているか、見つめ直すきっかけになったでしょうか。




[32m10-26 02:15:27[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(678400,)
[32m10-26 02:15:27[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですね…。自分を苦しめているのが、この「〜すべき」っていう考え方なのかもしれないって、少し気づけた気がします。




[32m10-26 02:15:28[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(362496,)
[32m10-26 02:15:28[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、素晴らしい気づきですね。次回は、この「〜すべき」から、もう少し楽になれるような見方について一緒に考えていきましょうか。来週の同じ時間でよろしいでしょうか？




[32m10-26 02:15:28[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(530432,)
[32m10-26 02:15:28[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、大丈夫です。よろしくお願いします。
[32m10-26 02:15:28[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(143872,)


[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:20[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_71...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_71[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m131.754[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:21[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_71...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_71[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m135.358[0m seconds                                 


[32m10-26 02:21:14[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。今日はよくいらっしゃいましたね。今、何かお話ししたいこと、お聞かせいただけますか？
[32m10-26 02:21:14[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(316928,)
[32m10-26 02:21:14[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。最近、仕事も生活も、漠然とした不安や焦りを感じています。
[32m10-26 02:21:14[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(246272,)
[32m10-26 02:21:14[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうなんですね。漠然とした不安、ですか。どんな時に感じることが多いですか？
[32m10-26 02:21:15[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(302080,)
[32m10-26 02:21:15[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
小さなことでも『これでいいのかな』と不安になり、先のことを考えると息苦しくなってしまいます。
[32m10-26 02:21:15[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(281088,)
[32

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:21[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_72...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_72[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m131.875[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:15[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_72...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_72[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m127.601[0m seconds                                 


[32m10-26 02:26:12[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、こんにちは。今日は少し、これまでのセッションを振り返りながら、今後のことについてもお話しできたらと思うのですが、いかがでしょうか？
[32m10-26 02:26:12[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(445952,)
[32m10-26 02:26:12[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、こんにちは。そうですね、もうそんな時期なんですね。あっという間でした。
[32m10-26 02:26:12[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(286208,)
[32m10-26 02:26:12[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうですね。この数ヶ月、〇〇さんが本当にたくさんのことに取り組んでこられた時間でしたね。特に、最初にお話しされていた漠然とした不安や、仕事への焦りといった点で、何か変化を感じることはありますか？
[32m10-26 02:26:13[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(709120,)
[32m10-26 02:26:13[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
うーん、そうですね。以前は、本当に何をするにも不安で、特に仕事のことも、自分が何をしたいのかもわからなくて、ずっとモヤモヤしていたんで

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:38[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_73...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_73[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m153.977[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:40[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_73...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_73[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m154.300[0m seconds                                 


[32m10-26 02:32:05[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Bさん。先週お話しされていた「漠然とした焦り」について、今日はもう少しお伺いできたらと思います。いかがでしょうか？
[32m10-26 02:32:05[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(413184,)
[32m10-26 02:32:05[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、なんだか最近、何をしていても「こうすべきだ」って考えが頭から離れないんです。完璧にこなさなきゃ、って。
[32m10-26 02:32:06[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(380928,)
[32m10-26 02:32:06[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。「完璧にこなすべきだ」、と。ええ、なるほど。そういったお気持ちが、Bさんを少し息苦しくさせているように聞こえますね。
[32m10-26 02:32:06[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(491520,)
[32m10-26 02:32:06[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、本当に。例えば、仕事でちょっとしたミスでも、「なんでこんなこともできないんだろう」って、ずっと考えてしまって。
[32m10-26 02:32:06[0m |[1m  IN



[32m10-26 02:32:06[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(505344,)
[32m10-26 02:32:06[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうです。それが疲れる原因になっているのは分かっているんですが、止められないんです。
[32m10-26 02:32:07[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(257536,)
[32m10-26 02:32:07[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。止められない、と。はい。もしかしたら、その「〜すべき」という考え方が、Bさんの今の不安や焦りに繋がっているのかもしれませんね。




[32m10-26 02:32:07[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(494592,)
[32m10-26 02:32:07[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そう、かもしれません…。
[32m10-26 02:32:07[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(86528,)
[32m10-26 02:32:07[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ。この「〜すべき」という考え方について、今日は少しお話しできましたね。次回は、この考え方がBさんにとってどんな意味を持っているのか、もう少し一緒に掘り下げていけたらと思いますが、いかがでしょうか？




[32m10-26 02:32:07[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(688128,)
[32m10-26 02:32:07[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、ぜひお願いします。
[32m10-26 02:32:08[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(87552,)
[32m10-26 02:32:08[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ありがとうございます。では、本日はここまでとしましょう。お疲れ様でした。
[32m10-26 02:32:08[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(255488,)
[32m10-26 02:32:08[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ありがとうございました。
[32m10-26 02:32:08[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(60928,)


[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:18[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_74...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_74[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m133.416[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:16[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_74...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_74[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m128.390[0m seconds                                 


[32m10-26 02:37:09[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、こんにちは。今日はこれまでのセッションを少し振り返りながら、終結に向けてのお話もできればと思うのですが、いかがでしょうか？
[32m10-26 02:37:10[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(423936,)
[32m10-26 02:37:10[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。はい、そうですね。あっという間でしたけど、色々と変わったなと感じています。
[32m10-26 02:37:10[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(295424,)
[32m10-26 02:37:10[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうなんですね。この数ヶ月、〇〇さんが本当に色々なことに向き合ってこられましたものね。特に、ご自身の中で「これは変わったな」と感じることはありますか？
[32m10-26 02:37:10[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(517632,)
[32m10-26 02:37:10[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
うーん…はい。以前は、仕事でミスをすると「自分はダメだ」って、すごく落ち込んでしまっていたんです。それが、最近は「次はどうすればいいかな」って、少し冷静に考えられるようになった

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:28[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_75...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_75[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m140.874[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:27[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_75...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_75[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m139.880[0m seconds                                 


[32m10-26 02:42:31[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
Bさん、こんにちは。今日はどのような感じでお過ごしでしたか？何か気になっていることなどありますか？
[32m10-26 02:42:31[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(322560,)
[32m10-26 02:42:31[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。最近、もっと頑張らなきゃいけないのに、全然できてないって、ずっと考えてしまって…。会社でも家でも、『すべき』ことが山積みな気がして。
[32m10-26 02:42:32[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(482304,)
[32m10-26 02:42:32[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
なるほど、そうなんですね。『すべき』ことがたくさんある、と感じていらっしゃるのですね。ええ。
[32m10-26 02:42:32[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(292864,)
[32m10-26 02:42:32[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。例えば、仕事でも『もっと早く終わらせるべきだった』とか、家でも『週末くらいは掃除を完璧にするべきだ』とか、つい考えてしまって。
[32m10-26 02:42:32[0m |[1m  INFO  [0m



[32m10-26 02:42:32[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(544256,)
[32m10-26 02:42:32[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
うーん…なんだか、いつも焦っているような、自分を責めているような気持ちになります。やらなきゃ、って思うのに、体が動かないというか…。
[32m10-26 02:42:33[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(434176,)
[32m10-26 02:42:33[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
なるほど、焦りやご自身を責めるお気持ちになるのですね。ええ、体が動かない感覚もあるのですね。
[32m10-26 02:42:33[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(353280,)
[32m10-26 02:42:33[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。結局、何もできてない自分にまたがっかりして、また『もっと頑張るべきなのに』って、悪循環なのは分かっているんですけど…。
[32m10-26 02:42:33[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(407552,)
[32m10-26 02:42:33[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from t

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:25[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_76...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_76[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m139.550[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:22[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_76...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_76[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m132.070[0m seconds                                 


[32m10-26 02:47:51[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、今日はこれまでのセッションを少し振り返りませんか。何か変化や気づきはありましたか？
[32m10-26 02:47:52[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(287744,)
[32m10-26 02:47:52[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですね、最初に来た頃に比べると、漠然とした不安が少し軽くなったような気がします。特に何がどう、って説明するのは難しいんですけど。
[32m10-26 02:47:52[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(463872,)
[32m10-26 02:47:52[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、漠然とした不安が軽くなったんですね。大切な気づきです。どんな時にそう感じられましたか？
[32m10-26 02:47:52[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(325632,)
[32m10-26 02:47:52[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
以前は仕事でミスをすると「もうダメだ」って落ち込んでたんですけど、最近は「まあ、そういうこともあるか」って思えるようになりました。
[32m10-26 02:47:53[0m |[1m  INFO  [0m| tts_mode

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:30[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_77...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_77[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m145.435[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:21[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_77...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_77[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m134.762[0m seconds                                 


[32m10-26 02:53:16[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。Bさん、今日は少しお話しを聞かせていただけますか？最近、何か気になることや、心の中で感じていることはありますか？
[32m10-26 02:53:16[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(448512,)
[32m10-26 02:53:16[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、先生。最近、なんだかずっと気持ちが落ち着かなくて…。もっとこう、ちゃんとしないといけないのに、全然できていないような気がして。
[32m10-26 02:53:16[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(436224,)
[32m10-26 02:53:16[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。ちゃんとしないといけない、と感じていらっしゃるんですね。ええ、具体的には、どのような時にそう思われることが多いですか？
[32m10-26 02:53:16[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(424448,)
[32m10-26 02:53:16[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
仕事でも、家でも、やるべきことがたくさんあるはずなのに、なかなか手がつかなくて。周りの人はもっと効率的にこなしているのに、自分はダメだなって。
[32m10-



[32m10-26 02:53:17[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(452608,)
[32m10-26 02:53:17[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、たくさんの「〜べき」が頭の中にあるのですね。そうなんですね。もし、仮にですが、その「積極的に意見を言うべき」というのを、今日一日、少しだけ「言わない」という選択をしてみたら、Bさんにとってどんな気持ちになるでしょう？




[32m10-26 02:53:18[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(728576,)
[32m10-26 02:53:18[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
え、言わない選択…ですか。うーん、きっと不安になると思います。「また何も言えなかった」って、後悔しそうですし。
[32m10-26 02:53:18[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(397312,)
[32m10-26 02:53:18[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですね、不安や後悔を感じるかもしれませんね。はい。では、その「言わない」選択をしたとして、実際に周りの方や状況に、何か具体的な変化が起こると思いますか？
[32m10-26 02:53:18[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(583168,)
[32m10-26 02:53:18[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
うーん…。もしかしたら、そんなに大きな変化はないのかもしれません。誰もそこまで私の発言に注目しているわけではない、のかもしれませんね…。
[32m10-26 02:53:18[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(444928,)
[32m10-26 02:53:18[0m |[1m  INFO  [0m| tts_model.py:259 | Start g



[32m10-26 02:53:19[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(1097728,)
[32m10-26 02:53:19[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、そうかもしれません…。なんだか、少し考えさせられます。
[32m10-26 02:53:19[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(211456,)
[32m10-26 02:53:19[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、今日はここまでとしましょうか。次回の面接では、この「〜べき」というお気持ちについて、もう少し詳しく一緒に考えていけたらと思います。何か、今日の面接で気づいたことや、考えてみたいことがあれば、メモしておいていただけますか？




[32m10-26 02:53:19[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(780288,)
[32m10-26 02:53:19[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、わかりました。ありがとうございます。
[32m10-26 02:53:20[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(161792,)
[32m10-26 02:53:20[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
では、また来週お会いしましょう。
[32m10-26 02:53:20[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(104960,)


[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:38[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_78...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_78[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m151.950[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:44[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_78...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_78[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m155.912[0m seconds                                 


[32m10-26 02:59:01[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、今日はよくお越しくださいました。どうぞ、楽になさってくださいね。
[32m10-26 02:59:02[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(261120,)
[32m10-26 02:59:02[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。ありがとうございます。なんだか、少し緊張しています。
[32m10-26 02:59:02[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(233472,)
[32m10-26 02:59:02[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうですよね。初めてお話しされるのは、どなたでも緊張されるものですから。今日はどんなことをお話しいただけますか？無理のない範囲で、ゆっくりお聞かせくださいね。
[32m10-26 02:59:02[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(565248,)
[32m10-26 02:59:02[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。最近、仕事のことで漠然とした不安があるのと、日常生活でもなんだか焦燥感を感じることが多くて…。
[32m10-26 02:59:03[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:32[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_79...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_79[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m144.184[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:36[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_79...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_79[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m145.511[0m seconds                                 


[32m10-26 03:04:34[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Bさん。今週はいかがお過ごしでしたか？
[32m10-26 03:04:35[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(157696,)
[32m10-26 03:04:35[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、先生。うーん、なんだかバタバタして、あまり落ち着かなかった一週間でした。
[32m10-26 03:04:35[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(297472,)
[32m10-26 03:04:35[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうでしたか。落ち着かない一週間だったんですね。
[32m10-26 03:04:35[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(154624,)
[32m10-26 03:04:35[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。仕事でも、もっと効率よくこなすべきなのに、思うようにいかなくて。家でも、ちゃんと家事を完璧にやらなきゃって。
[32m10-26 03:04:36[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(389632,)
[32m10-26 03:04:36



[32m10-26 03:04:38[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(905216,)
[32m10-26 03:04:38[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、気づけて良かったです。
[32m10-26 03:04:38[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(102912,)
[32m10-26 03:04:38[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、素晴らしい気づきだと思います。次回は、この『〜すべき思考』が本当に『すべき』なのか、もう少し別の見方もできるのか、一緒に考えていきましょう。来週までに、『〜すべきだ』と思った時に、『今、〜すべきだと思ったな』とメモする簡単なホームワークはいかがでしょうか。気づくだけで構いません。




[32m10-26 03:04:39[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(964608,)
[32m10-26 03:04:39[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、やってみます。
[32m10-26 03:04:39[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(79360,)
[32m10-26 03:04:39[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ありがとうございます。では、来週の同じ時間にお待ちしておりますね。
[32m10-26 03:04:39[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(219648,)
[32m10-26 03:04:39[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、ありがとうございました。
[32m10-26 03:04:39[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(102912,)


[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:30[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_80...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_80[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m145.306[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:35[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_80...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_80[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m146.371[0m seconds                                 


[32m10-26 03:10:21[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Bさん。今日はどんなご様子でしたか？何か、最近気になっていることはありますか？
[32m10-26 03:10:21[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(293888,)
[32m10-26 03:10:21[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、先生。最近、仕事でもプライベートでも、「もっとこうすべき」なのに、できてないなと感じることが多くて、なんだか焦っているんです。
[32m10-26 03:10:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(447488,)
[32m10-26 03:10:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
なるほど、「もっとこうすべき」と。ええ、焦りを感じるのですね。具体的に、どんな時にそう感じますか？
[32m10-26 03:10:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(366080,)
[32m10-26 03:10:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。仕事だと、もっと効率的に動いて、周りの期待に応えなければいけないのに、って。家でも、完璧にこなすべきだと思ってしまって。
[32m10-26 03:10:22[0m |[1m  INFO  [0m| tts_m



[32m10-26 03:10:24[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(261632,)
[32m10-26 03:10:24[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。はい。ご自身を責めてしまう。その声は、Bさんを助けていますか？それとも、苦しくさせていますか？
[32m10-26 03:10:24[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(446464,)
[32m10-26 03:10:24[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
苦しい、ですね。でも、そうしないといけない、と思ってしまうんです。
[32m10-26 03:10:24[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(207872,)
[32m10-26 03:10:24[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、よくわかります。完璧に「すべき」という考え方は、Bさんを支えてきた面もあるでしょう。でも今、それが少し苦しめているのかもしれない、という視点も、少しだけ持ってみてはいかがでしょうか。
[32m10-26 03:10:24[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(659968,)
[32m10-26 03:10:24[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio dat

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:37[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_81...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_81[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m149.888[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:37[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:06[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_81...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_81[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m152.904[0m seconds                                 


[32m10-26 03:16:23[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。今日は〇〇さんとお話できるのを楽しみにしていました。今のお気持ち、お聞かせいただけますか？
[32m10-26 03:16:23[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(341504,)
[32m10-26 03:16:23[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。最近、仕事もプライベートも漠然とした不安と焦りがあって、何が原因か分からなくて。
[32m10-26 03:16:23[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(318464,)
[32m10-26 03:16:23[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。ええ、不安や焦りを感じていらっしゃるんですね。お辛いですね。どんな時にそう感じますか？
[32m10-26 03:16:24[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(365568,)
[32m10-26 03:16:24[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
仕事では締め切りに追われるようで。家ではあれこれ考えてしまい、何も手につかず時間だけが過ぎていく感じです。
[32m10-26 03:16:24[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data gen

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:19[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_82...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_82[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m129.065[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:16[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_82...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_82[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m125.191[0m seconds                                 


[32m10-26 03:21:14[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Bさん。今日はどんな様子でいらっしゃいますか？
[32m10-26 03:21:14[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(167424,)
[32m10-26 03:21:14[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、先生。なんだか、最近ずっと焦っているような気がして。あれもこれも「しなきゃいけない」って、いつも頭の中がそれでいっぱいで。
[32m10-26 03:21:14[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(437760,)
[32m10-26 03:21:14[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
なるほど、そうなんですね。「あれもこれも、しなきゃいけない」と、頭の中でずっと感じていらっしゃるんですね。ええ。
[32m10-26 03:21:15[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(389120,)
[32m10-26 03:21:15[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。仕事もそうだし、家事も、もっと効率よく「すべき」だし。趣味の時間だって、もっと有意義に「使わなきゃ」って思ってしまうんです。
[32m10-26 03:21:15[0m |[1m  INFO  [0m| tts_model.py:3

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:34[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_83...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_83[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m146.870[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:37[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_83...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_83[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m148.801[0m seconds                                 


[32m10-26 03:27:03[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
Bさん、今日はありがとうございます。どんなことでお話しいただけますか？
[32m10-26 03:27:04[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(219648,)
[32m10-26 03:27:04[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。最近、漠然とした不安や焦りがあって、仕事も手につきません。
[32m10-26 03:27:04[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(232448,)
[32m10-26 03:27:04[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。不安や焦り、お仕事にも影響が。
[32m10-26 03:27:04[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(183808,)
[32m10-26 03:27:04[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、ミスが増えてしまって。それがまたストレスで。
[32m10-26 03:27:04[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(173568,)
[32m10-26 03:27:04[0m |[1m  INFO  [0m| tts_model.

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:12[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_84...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_84[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m124.664[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h[?25l

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:10[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:04[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_84...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_84[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m122.210[0m seconds                                 


[32m10-26 03:31:48[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
Bさん、今日はよくお越しくださいました。少し緊張されていますか？今日はゆっくりお話を聞かせていただけたらと思いますので、どうぞ楽にしてくださいね。
[32m10-26 03:31:49[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(503296,)
[32m10-26 03:31:49[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、ありがとうございます。なんだか、うまく話せるか不安で…。最近、仕事でもプライベートでも、漠然とした焦りや不安を感じることが多くて、どうしたらいいのか分からなくなってしまって。
[32m10-26 03:31:49[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(624128,)
[32m10-26 03:31:49[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。漠然とした焦りや不安…、それはとてもお辛い気持ちですね。どんな時に特にそう感じることが多いですか？差し支えなければ、もう少し詳しくお聞かせいただけますか。
[32m10-26 03:31:49[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(616960,)
[32m10-26 03:31:49[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ。仕事では、常に締め切りに追われているような感覚で

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:41[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_85...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_85[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m153.740[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:43[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_85...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_85[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m155.671[0m seconds                                 


[32m10-26 03:37:36[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、こんにちは。今日はこれまでのセッションを少し振り返って、これからのことを一緒に考えていきませんか？
[32m10-26 03:37:36[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(344064,)
[32m10-26 03:37:36[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、そうですね。もうそんな時期なんですね…早いような、長かったような。
[32m10-26 03:37:37[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(287232,)
[32m10-26 03:37:37[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうですね。色々なことをお話してくださいましたもんね。これまでのセッションを振り返ってみて、〇〇さんの中で何か変化を感じることはありますか？
[32m10-26 03:37:37[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(475136,)
[32m10-26 03:37:37[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですね…最初は、仕事も日常生活も、漠然とした不安ばかりで、何から手をつけていいのかも分からなかったんですけど。
[32m10-26 03:37:37[0m |[1m  INFO  [0m| tts_model.p

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:29[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_86...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_86[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m141.999[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:28[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_86...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_86[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m138.223[0m seconds                                 


[32m10-26 03:43:23[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
Bさん、こんにちは。今日はこれまでのセッションを振り返り、今後についてお話しできればと思いますが、いかがですか？
[32m10-26 03:43:24[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(379904,)
[32m10-26 03:43:24[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、ありがとうございます。もうそんな時期なんですね。少し寂しいですが、振り返るのは良い機会ですね。
[32m10-26 03:43:24[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(373248,)
[32m10-26 03:43:24[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですね、ええ。この数ヶ月、色々なことをお話しくださいました。ご自身で、『変わったな』とか、『気づきがあったな』と思うことはありますか？
[32m10-26 03:43:24[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(451584,)
[32m10-26 03:43:24[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
うーん、最初は漠然とした不安ばかりだったんですけど、今は少し冷静に、『これはどういう状況だろう』って、立ち止まって考えられるようになった気がします。
[32m10-26 03:43:24[0m |

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:23[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_87...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_87[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m138.404[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:24[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_87...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_87[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m138.942[0m seconds                                 


[32m10-26 03:49:28[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、こんにちは。今日はこれまでのセッションを振り返りながら、終結に向けて少しお話しできればと思っています。
[32m10-26 03:49:28[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(358400,)
[32m10-26 03:49:28[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、本当にあっという間でした。最初は漠然とした不安ばかりだったのが、今は少し整理できた気がします。
[32m10-26 03:49:29[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(356864,)
[32m10-26 03:49:29[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうでしたね。『性格を直したい』というお話から始まりました。特に、どのようなことが〇〇さんにとって印象に残っていますか？
[32m10-26 03:49:29[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(380928,)
[32m10-26 03:49:29[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
自分の認知・感情・行動が繋がっていると気づけたことです。それを図にしてもらった時、すごく腑に落ちました。
[32m10-26 03:49:29[0m |[1m  INFO  [0m| tts_model.py:

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:22[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_88...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_88[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m139.513[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:23[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_88...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_88[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m137.171[0m seconds                                 


[32m10-26 03:54:48[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、今日は来てくださってありがとうございます。何か、今お話ししたいことや、気になっていることなどありますか？
[32m10-26 03:54:48[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(371712,)
[32m10-26 03:54:48[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、こんにちは。最近、なんだか仕事でもプライベートでも、ずっと漠然とした不安があって、落ち着かないんです。何から話せばいいのかも、よくわからなくて…。
[32m10-26 03:54:49[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(527360,)
[32m10-26 03:54:49[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。漠然とした不安、ということですね。ええ、大丈夫ですよ。今、感じていらっしゃることを、ゆっくりお話しいただければと思います。
[32m10-26 03:54:49[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(490496,)
[32m10-26 03:54:49[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
仕事では、やることがたくさんあるのに、なかなか手につかなくて。周りの人はどんどんこなしているのに、自分だけ置いていかれているような焦りを感じてし

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:34[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:06[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_89...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_89[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m150.226[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:28[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_89...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_89[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m141.734[0m seconds                                 


[32m10-26 04:00:30[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Bさん。今週はいかがでしたか？何か気になることはありましたか？
[32m10-26 04:00:30[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(248320,)
[32m10-26 04:00:30[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、A先生。なんだか、ずっと焦っているような感覚で…。やるべきことがあるのに、手がつけられなくて、情けないな、と。
[32m10-26 04:00:30[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(412160,)
[32m10-26 04:00:30[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、なるほど。やるべきことがあっても手がつけられず、ご自身を情けないと感じてしまうのですね。もう少し詳しくお聞かせいただけますか？
[32m10-26 04:00:31[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(465920,)
[32m10-26 04:00:31[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。仕事でも「完璧にやらなきゃ」、家事も「きちんとこなすべき」って、いつも頭にあるんです。でも、疲れてしまって、思うように動けないことが多くて…。
[32m10-26 04:00:31[0m |[1m  INFO 



[32m10-26 04:00:31[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(539136,)
[32m10-26 04:00:31[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
なんだか常に追い立てられているようで、休んでいても落ち着かないんです。休むこと自体、いけないことのような気がしてしまって。
[32m10-26 04:00:32[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(370176,)
[32m10-26 04:00:32[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、休むことさえ、いけないことのように感じてしまうのですね。その「〜すべき」という考えは、いつ頃からBさんの心の中に強くあるように感じられますか？




[32m10-26 04:00:32[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(524800,)
[32m10-26 04:00:32[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
うーん…物心ついた頃から、そう考えていた気がします。両親が厳しかったのも、あるかもしれません。
[32m10-26 04:00:32[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(351232,)
[32m10-26 04:00:32[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうでしたか。小さい頃からの経験が、今のBさんの考え方に影響しているのですね。もし、少しだけ「〜すべき」という気持ちを緩めてみたら、どうなると思いますか？




[32m10-26 04:00:32[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(555520,)
[32m10-26 04:00:32[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
緩める、ですか…。少し怖いような気もします。全部崩れてしまうんじゃないかって。
[32m10-26 04:00:33[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(262144,)
[32m10-26 04:00:33[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そう感じますよね。でも、もしかしたら、肩の力が抜けることで、もっと楽に物事を進められるかもしれません。今日は、「完璧にやらなければ」という考えがBさんを苦しめていること、その背景を探ることができました。次回は、この「〜すべき」という考えとどう向き合うか、一緒に考えていきましょう。




[32m10-26 04:00:33[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(1049088,)
[32m10-26 04:00:33[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、そうですね。少し、気持ちが整理できた気がします。ぜひ、お願いします。
[32m10-26 04:00:33[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(275968,)
[32m10-26 04:00:33[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、良かったです。では、次回の予約ですが、いかがでしょうか？
[32m10-26 04:00:34[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(206336,)
[32m10-26 04:00:34[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
来週の同じ時間でお願いします。
[32m10-26 04:00:34[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(96768,)
[32m10-26 04:00:34[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
かしこまりました。では、また来週、お待ちしておりますね。
[32m10-26 04:00:34[0m |[1m  INFO  [0m| tts_model.py:3

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:27[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_90...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_90[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m141.762[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:31[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_90...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_90[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m143.799[0m seconds                                 


[32m10-26 04:06:21[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、今日はよくいらっしゃいましたね。どうぞ、楽な姿勢でお座りください。
[32m10-26 04:06:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(254976,)
[32m10-26 04:06:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、ありがとうございます。
[32m10-26 04:06:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(98816,)
[32m10-26 04:06:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ。今日は、どのようなことについてお話ししたいと思われましたか？
[32m10-26 04:06:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(201728,)
[32m10-26 04:06:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
最近、仕事もそうですが、漠然とした不安や焦りを感じています。何が原因か、自分でも分からなくて。
[32m10-26 04:06:23[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(368128,)
[32m10-26 04:06:23[0m |[1m  INFO 

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:17[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h[?25l

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_91...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_91[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m126.624[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:17[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_91...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_91[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m131.039[0m seconds                                 


[32m10-26 04:11:16[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、こんにちは。今日はこれまでのセッションを少し振り返ってみて、この先のことについてお話しできたらと思うのですが、いかがでしょうか？
[32m10-26 04:11:17[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(445952,)
[32m10-26 04:11:17[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、なんだかもうそんな時期なんだなあって感じですね。あっという間でした。
[32m10-26 04:11:17[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(254464,)
[32m10-26 04:11:17[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうですね。この数ヶ月を振り返ってみて、〇〇さんご自身で、何か変わったなと感じることはありますか？
[32m10-26 04:11:17[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(346112,)
[32m10-26 04:11:17[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
うーん…最初は、漠然と仕事も私生活も、全部うまくいってない気がして焦っていたんですけど、今は少し、何に不安を感じているのか、わかるようになってきた気がします。
[32m10-26 04:11:18[0m |[1m  INFO  

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:30[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_92...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_92[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m143.028[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:29[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_92...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_92[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m142.315[0m seconds                                 


[32m10-26 04:16:49[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、〇〇さん。今日は来てくださってありがとうございます。今日はどんなことについてお話ししたいですか？
[32m10-26 04:16:49[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(299520,)
[32m10-26 04:16:49[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、なんか、最近仕事のこともそうですけど、何となく毎日がうまくいってない気がして…漠然とした不安があるんです。
[32m10-26 04:16:49[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(390144,)
[32m10-26 04:16:49[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。漠然とした不安、ですか。ええ、はい。
[32m10-26 04:16:49[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(194560,)
[32m10-26 04:16:49[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。仕事でも集中できないことが増えて、焦るのに何も手につかなくて…。家でも全然リラックスできなくて。
[32m10-26 04:16:49[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated succ

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:19[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_93...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_93[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m131.395[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:16[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_93...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_93[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m127.130[0m seconds                                 


[32m10-26 04:21:54[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Bさん。今日は、どんなことをお話ししたい気分ですか？無理なく、お話しできる範囲で大丈夫ですよ。
[32m10-26 04:21:54[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(353792,)
[32m10-26 04:21:54[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。最近、漠然とした不安がずっとあって、何から話したらいいか分からなくて…。
[32m10-26 04:21:55[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(300544,)
[32m10-26 04:21:55[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。漠然とした不安。はい。何か、特に気になっていることはありますか？
[32m10-26 04:21:55[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(293376,)
[32m10-26 04:21:55[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
仕事が一番ですが、家でも落ち着かなくて。休日も休んだ気がしません。
[32m10-26 04:21:55[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(249856,)


[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:19[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_94...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_94[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m133.075[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:21[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_94...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_94[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m137.126[0m seconds                                 


[32m10-26 04:26:58[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
Bさん、前回のお話から、日常生活で「〜すべきだ」と感じることが、少しお辛く感じられている、というお話がありましたね。今日はそのあたりを少し深掘りできたらと思うのですが、いかがでしょうか？




[32m10-26 04:26:58[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(651264,)
[32m10-26 04:26:58[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、なんだか、いつも「こうしなきゃ」とか「もっと頑張るべきだ」って、頭の中で繰り返している気がして…疲れてしまいます。
[32m10-26 04:26:58[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(397824,)
[32m10-26 04:26:58[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。ええ。具体的に、どんな時にそう強く感じることが多いですか？ 例えば、最近あった出来事などでも構いませんよ。
[32m10-26 04:26:59[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(438784,)
[32m10-26 04:26:59[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええと、仕事で新しいプロジェクトを任された時なんですが、「完璧にこなさなければならない」って思ってしまって。少しでもミスがあると、もうだめだ、って。
[32m10-26 04:26:59[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(475648,)
[32m10-26 04:26:59[0m |[1m  INFO  [0m| tts_model.py:259 | Start generat

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:38[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_95...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_95[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m155.207[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:46[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_95...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_95[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m165.288[0m seconds                                 


[32m10-26 04:33:18[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、Bさん。今日はどんなことをお話ししたいですか？少しでも心が軽くなるような時間になれば嬉しいのですが。
[32m10-26 04:33:18[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(370688,)
[32m10-26 04:33:18[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。最近、なんだかいつも焦っていて…もっとしっかりしなきゃ、って思うんですけど、なかなかうまくいかなくて。
[32m10-26 04:33:19[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(398848,)
[32m10-26 04:33:19[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうなんですね。ええ、「もっとしっかりしなきゃ」というお気持ち、もう少し詳しく聞かせてもらえますか？
[32m10-26 04:33:19[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(324608,)
[32m10-26 04:33:19[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。仕事でもプライベートでも、「完璧にこなすべきだ」って思ってしまうんです。手を抜くとダメだって。休む時も落ち着かなくて…。
[32m10-26 04:33:19[0m |[1m  INFO  [0m| tts_m



[32m10-26 04:33:20[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(783872,)
[32m10-26 04:33:20[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そうですね…。いつも息苦しいです。でも、他にどう考えたらいいのか、わからなくて…。
[32m10-26 04:33:20[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(313856,)
[32m10-26 04:33:20[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、大丈夫ですよ。今日は、Bさんが「〜すべきだ」という考えで苦しんでいるかもしれない、ということに気づけただけでも大きな一歩です。次回は、その「〜すべきだ」がどんな時に強く現れるのか、もう少し具体的に見ていきましょうか。




[32m10-26 04:33:21[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(762368,)
[32m10-26 04:33:21[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、わかりました。
[32m10-26 04:33:21[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(78336,)
[32m10-26 04:33:21[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ありがとうございます。今日のところはここまでとしましょうか。次回の予約は、来週の同じ曜日、時間でよろしいでしょうか？
[32m10-26 04:33:21[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(392704,)
[32m10-26 04:33:21[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、大丈夫です。よろしくお願いします。
[32m10-26 04:33:21[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(146432,)
[32m10-26 04:33:21[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こちらこそ、また来週お待ちしておりますね。ゆっくりお過ごしください。
[32m10-26 04:33:21[0m |[1m  INFO  [0m| tts_

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:26[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_96...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_96[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m142.550[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:25[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:06[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_96...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_96[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m145.721[0m seconds                                 


[32m10-26 04:39:06[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、こんにちは。前回の漠然とした焦りの部分、今日は少し掘り下げてみませんか？
[32m10-26 04:39:06[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(296960,)
[32m10-26 04:39:06[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。最近も、『もっとこうすべきだ』とか、『これくらいはできて当然だ』と、自分を追い込んでしまうんです。
[32m10-26 04:39:07[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(309248,)
[32m10-26 04:39:07[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
なるほど。その『〜すべきだ』という考え方が、焦りにつながっているんですね。どんな時にそう感じますか？
[32m10-26 04:39:07[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(343040,)
[32m10-26 04:39:07[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
仕事で新しいプロジェクトを任されたり、家事をしている時でも、『完璧にこなすべき』って。うまくいかないと、すごく落ち込みます。
[32m10-26 04:39:07[0m |[1m  INFO  [0m| tts_model.py:324 | Audio d



[32m10-26 04:39:09[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(500224,)
[32m10-26 04:39:09[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、やってみます。
[32m10-26 04:39:09[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(77824,)
[32m10-26 04:39:09[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ありがとうございます。今日は、ご自身の『〜すべき思考』について見つめる良い機会になったと思います。また次回、その時の気持ちの変化など、お聞かせくださいね。
[32m10-26 04:39:09[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(508928,)
[32m10-26 04:39:09[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、ありがとうございました。
[32m10-26 04:39:09[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(100352,)


[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:19[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_97...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_97[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m130.065[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:20[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_97...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_97[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m137.284[0m seconds                                 


[32m10-26 04:44:18[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、〇〇さん。今日は何かお話ししたいことはありますか？
[32m10-26 04:44:18[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(179200,)
[32m10-26 04:44:18[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは、先生。最近、どうも落ち着かなくて。何をしていても「もっとちゃんとしないと」って、焦ってしまうんです。
[32m10-26 04:44:18[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(367104,)
[32m10-26 04:44:18[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
「もっとちゃんとしないと」ですね。どんな時にそう思われますか？




[32m10-26 04:44:19[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(190464,)
[32m10-26 04:44:19[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
仕事でも家事でも、「完璧にやるべきだ」って。でも、それができなくて落ち込んだり、不安になったり…。
[32m10-26 04:44:19[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(323072,)
[32m10-26 04:44:19[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
「完璧にやるべきだ」というお気持ち。その時、どんな感情が湧きますか？




[32m10-26 04:44:19[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(224768,)
[32m10-26 04:44:19[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
不安と焦りですね。ずっと頑張っていないといけないような気がして、苦しいです。
[32m10-26 04:44:20[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(237056,)
[32m10-26 04:44:20[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、不安や焦りですね。「〜すべきだ」という考えが、〇〇さんを苦しめているのかもしれませんね。




[32m10-26 04:44:20[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(314368,)
[32m10-26 04:44:20[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
確かに、そうかも…。いつも自分を追い立てている気がします。
[32m10-26 04:44:20[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(193536,)
[32m10-26 04:44:20[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
もし、「完璧でなくても大丈夫」だとしたら、どう感じられるでしょう？
[32m10-26 04:44:20[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(212992,)
[32m10-26 04:44:20[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
そう思えたら、少し楽になるかな…でも、なかなかそうは思えなくて。
[32m10-26 04:44:21[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(254976,)
[32m10-26 04:44:21[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。すぐに変わらなくても大丈夫ですよ。まず、その「〜すべきだ」という考えに気づくことが大切です。




[32m10-26 04:44:21[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(322048,)
[32m10-26 04:44:21[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい。自分では気づいていなかったです。
[32m10-26 04:44:21[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(122368,)
[32m10-26 04:44:21[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
素晴らしい気づきです。次回まで、日常生活で「〜すべきだ」と思った時に、どんな気持ちになるか、少し意識して観察してみるのはいかがでしょう？




[32m10-26 04:44:21[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(465920,)
[32m10-26 04:44:21[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
わかりました。試してみます。
[32m10-26 04:44:21[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(107520,)
[32m10-26 04:44:21[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、ありがとうございます。では、今日のところはここまで。来週またお話し聞かせてくださいね。
[32m10-26 04:44:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(317952,)
[32m10-26 04:44:22[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はい、来週もよろしくお願いします。
[32m10-26 04:44:22[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(120832,)


[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:17[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_98...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_98[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m126.462[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:17[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_98...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_98[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m134.706[0m seconds                                 


[32m10-26 04:49:19[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
〇〇さん、こんにちは。今日は来てくださってありがとうございます。少し緊張されているかもしれませんが、ゆっくり、〇〇さんのペースでお話しくださいね。今日はどんなことをお話ししたい気分ですか？
[32m10-26 04:49:19[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(587264,)
[32m10-26 04:49:19[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。あの、なんだか最近、仕事のこともそうですし、家でもなんだか落ち着かないというか、漠然とした不安があるんです。特に何かあったわけじゃないんですけど…
[32m10-26 04:49:20[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(529408,)
[32m10-26 04:49:20[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
ええ、そうなんですね。漠然とした不安が、仕事だけでなく、日常生活でも感じていらっしゃるのですね。はい。何か具体的なきっかけというよりは、日々の中で、なんとなくそういった気持ちが募ってきた、ということでしょうか。
[32m10-26 04:49:20[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
(734208,)
[32m10-26 04:49:20[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from tex

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m   0%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0/1 [0m [ [33m0:00:00[0m < [36m-:--:--[0m , [31m? it/s[0m ]

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:36[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_99...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mA_99[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m151.608[0m seconds                                 
[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                      
[2;36m [0m[32mINFO    [0m Loading corpus from source files[33m...[0m                                   


[2K[35m   1%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/100 [0m [ [33m0:00:01[0m < [36m-:--:--[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m1[0m file, average number of utterances per       
[2;36m [0m         speaker: [1;36m1.0[0m                                                          
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                  
[2;36m [0m         MFA will only use [1;36m1[0m jobs. Use the --single_speaker flag if you would  
[2;36m [0m         like to split utterances across jobs regardless of their speaker.     
[2;36m [0m[32mINFO    [0m Normalizing text[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Generating MFCCs[33m...[0m                                                   


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:02[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Calculating CMVN[33m...[0m                                                   
[2;36m [0m[32mINFO    [0m Generating final features[33m...[0m                                          


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Creating corpus split[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                          
[2;36m [0m[32mINFO    [0m Performing first-pass alignment[33m...[0m                                    
[2;36m [0m[32mINFO    [0m Generating alignments[33m...[0m                                              


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:40[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Collecting phone and word alignments from alignment lattices[33m...[0m       


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:05[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Analyzing alignment quality[33m...[0m                                        


[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:01[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[2K[35m 100%[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1/1 [0m [ [33m0:00:00[0m < [36m0:00:00[0m , [31m? it/s[0m ]
[?25h

[2;36m [0m[32mINFO    [0m Exporting alignment TextGrids to                                      
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_99...[0m                                                               
[2;36m [0m[32mINFO    [0m Finished exporting TextGrids to                                       
[2;36m [0m         [35m/users/s1f102201582/Github/jmoshi-ft/gen_dialogue/data/sbv/mfa_output/[0m
[2;36m [0m         [95mB_99[0m!                                                                 
[2;36m [0m[32mINFO    [0m Done! Everything took [1;36m155.041[0m seconds                                 


CPU times: user 44min 21s, sys: 1min 16s, total: 45min 37s
Wall time: 9h 10min 33s
