# Style-Bert-VITS2ライブラリの使用例

`pip install style-bert-vits2`を使った、Jupyter Notebookでの使用例です。Google Colabなどでも動作します。

In [1]:
# # PyTorch環境の構築（ない場合）
# # 参照: https://pytorch.org/get-started/locally/

# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [2]:
# # style-bert-vits2のインストール

# !pip install style-bert-vits2

In [3]:
# Load the BERT model and tokenizer used for Japanese text
# (there is no need to download them manually to a local directory).

from style_bert_vits2.constants import Languages
from style_bert_vits2.nlp import bert_models


# Hugging Face repository id shared by the Japanese model and its tokenizer.
JP_BERT_REPO = "ku-nlp/deberta-v2-large-japanese-char-wwm"

# Other languages can be enabled the same way by uncommenting these lines:
# bert_models.load_model(Languages.EN, "microsoft/deberta-v3-large")
# bert_models.load_tokenizer(Languages.EN, "microsoft/deberta-v3-large")
# bert_models.load_model(Languages.ZH, "hfl/chinese-roberta-wwm-ext-large")
# bert_models.load_tokenizer(Languages.ZH, "hfl/chinese-roberta-wwm-ext-large")

bert_models.load_model(Languages.JP, JP_BERT_REPO)
# Kept as the final expression so the notebook still displays the tokenizer.
bert_models.load_tokenizer(Languages.JP, JP_BERT_REPO)

  from .autonotebook import tqdm as notebook_tqdm


[32m03-16 07:00:17[0m |[1m  INFO  [0m| bert_models.py:92 | Loaded the Languages.JP BERT model from ku-nlp/deberta-v2-large-japanese-char-wwm
[32m03-16 07:00:17[0m |[1m  INFO  [0m| bert_models.py:154 | Loaded the Languages.JP BERT tokenizer from ku-nlp/deberta-v2-large-japanese-char-wwm


BertJapaneseTokenizer(name_or_path='ku-nlp/deberta-v2-large-japanese-char-wwm', vocab_size=22012, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [16]:
# Pick the model files to use for speech synthesis and, when requested,
# download them from Hugging Face.
# Downloaded files are placed under the model_assets directory.

from pathlib import Path
from huggingface_hub import hf_hub_download

# Alternative preset that is fetched from the Hub (note need_download = True):
# model_name = "jvnv-M1-jp"
# epoch = 158
# s = 14000
# need_download = True

# NOTE(review): need_download is False for this model, so its files are
# presumably expected to already exist under model_assets - confirm.
model_name = "jvnv_v1_f1_happy_surprise"
epoch = 100
s = 4000  # value of the `_s{s}` suffix in the checkpoint file name
need_download = False

# Paths are relative to the model_assets directory; later cells reuse them.
model_file = f"{model_name}/{model_name}_e{epoch}_s{s}.safetensors"
config_file = f"{model_name}/config.json"
style_file = f"{model_name}/style_vectors.npy"

required_files = [model_file, config_file, style_file]

if need_download:
    for file in required_files:
        print(file)
        hf_hub_download("litagin/style_bert_vits2_jvnv", file, local_dir="model_assets")

# Fail fast with a readable message instead of letting a later cell stumble
# over a missing file (e.g. need_download is False but nothing was copied in).
missing_files = [
    relative_path
    for relative_path in required_files
    if not (Path("model_assets") / relative_path).exists()
]
if missing_files:
    raise FileNotFoundError(
        "Missing model files under 'model_assets': "
        + ", ".join(missing_files)
        + ". Set need_download = True or place the files there manually."
    )

In [None]:
# Build a TTS model from the model files selected in the previous cell,
# as a first step of the speech-synthesis test.

import torch

from style_bert_vits2.tts_model import TTSModel

assets_root = Path("model_assets")

# Prefer the GPU, but fall back to the CPU so the notebook also runs on
# machines (or Colab runtimes) where CUDA is not available.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = TTSModel(
    model_path=assets_root / model_file,
    config_path=assets_root / config_file,
    style_vec_path=assets_root / style_file,
    device=device,
)

In [18]:
# Load the model.
model.load()

# `__net_g` is a private (name-mangled) attribute of TTSModel, so from outside
# the class it has to be accessed as `_TTSModel__net_g`.
net_g = model._TTSModel__net_g
# Bare name as the last expression so the notebook displays its repr.
net_g


[32m03-16 23:14:41[0m |[1m  INFO  [0m| infer.py:24 | Using JP-Extra model




[32m03-16 23:14:43[0m |[1m  INFO  [0m| safetensors.py:50 | Loaded 'model_assets/jvnv_v1_f1_happy_surprise/jvnv_v1_f1_happy_surprise_e100_s4000.safetensors' (iteration 100)


SynthesizerTrn(
  (enc_p): TextEncoder(
    (emb): Embedding(112, 192)
    (tone_emb): Embedding(12, 192)
    (language_emb): Embedding(3, 192)
    (bert_proj): Conv1d(1024, 192, kernel_size=(1,), stride=(1,))
    (style_proj): Linear(in_features=256, out_features=192, bias=True)
    (encoder): Encoder(
      (spk_emb_linear): Linear(in_features=512, out_features=192, bias=True)
      (drop): Dropout(p=0.1, inplace=False)
      (attn_layers): ModuleList(
        (0-5): 6 x MultiHeadAttention(
          (conv_q): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (conv_k): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (conv_v): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (conv_o): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (drop): Dropout(p=0.1, inplace=False)
        )
      )
      (norm_layers_1): ModuleList(
        (0-5): 6 x LayerNorm()
      )
      (ffn_layers): ModuleList(
        (0-5): 6 x FFN(
          (conv_1): Conv1d(192, 76

In [None]:
# Benchmark inference: synthesize the same sentence `count` times, print the
# mean time per call, then play back the audio from the last call.

import time
from IPython.display import Audio, display

style = "Neutral"
style_weight = 0
text = "はじめまして。私の名前はインテラです。あなたの名前は何ですか？"
# text = "なるほど。でもなぜダメなんですか？"
# text = "それすごいですね。でも、難しくないですか？"

audio, sr = None, None
elapsed = 0.0
count = 100
for i in range(count):
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock changes.
    start = time.perf_counter()
    sr, audio = model.infer(text=text, style=style, style_weight=style_weight)
    elapsed += time.perf_counter() - start
# The printed value is the mean number of seconds per infer() call.
print(f"Elapsed: {elapsed/count:.2f}s")

display(Audio(audio, rate=sr))

[32m03-16 23:45:14[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
はじめまして。私の名前はインテラです。あなたの名前は何ですか？
[32m03-16 23:45:24[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully
Elapsed: 9.95s
