In [1]:
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# !pip install style-bert-vits2
# !pip install numpy==1.26.4
!pip list

Package                                  Version
---------------------------------------- ------------
aiohappyeyeballs                         2.6.1
aiohttp                                  3.13.1
aiosignal                                1.4.0
annotated-types                          0.7.0
anyio                                    4.11.0
argon2-cffi                              25.1.0
argon2-cffi-bindings                     25.1.0
arrow                                    1.4.0
asttokens                                3.0.0
async-lru                                2.0.5
attrs                                    25.4.0
audioread                                3.0.1
babel                                    2.17.0
backoff                                  2.2.1
bcrypt                                   5.0.0
beautifulsoup4                           4.14.2
bleach                                   6.2.0
blis                                     1.3.0
build                                    1.3

In [2]:
# Load the BERT model and tokenizer (no manual local download is required).

from style_bert_vits2.constants import Languages
from style_bert_vits2.nlp import bert_models


# Single source of truth for the pretrained checkpoint used by both calls.
BERT_REPO_ID = "ku-nlp/deberta-v2-large-japanese-char-wwm"

bert_models.load_model(Languages.JP, BERT_REPO_ID)
# Kept as the cell's final expression so the tokenizer repr is still displayed.
bert_models.load_tokenizer(Languages.JP, BERT_REPO_ID)

  from .autonotebook import tqdm as notebook_tqdm


[32m10-22 01:29:11[0m |[1m  INFO  [0m| bert_models.py:92 | Loaded the Languages.JP BERT model from ku-nlp/deberta-v2-large-japanese-char-wwm
[32m10-22 01:29:12[0m |[1m  INFO  [0m| bert_models.py:154 | Loaded the Languages.JP BERT tokenizer from ku-nlp/deberta-v2-large-japanese-char-wwm


BertJapaneseTokenizer(name_or_path='ku-nlp/deberta-v2-large-japanese-char-wwm', vocab_size=22012, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [3]:
from pathlib import Path
from huggingface_hub import hf_hub_download


# Alternative voices; uncomment one group (and comment out the active one) to switch.
# model_file = "koharune-ami/koharune-ami.safetensors"
# config_file = "koharune-ami/config.json"
# style_file = "koharune-ami/style_vectors.npy"
# hf_repo = "litagin/sbv2_koharune_ami"
# model_file = "amitaro/amitaro.safetensors"
# config_file = "amitaro/config.json"
# style_file = "amitaro/style_vectors.npy"
# hf_repo = "litagin/sbv2_amitaro"

# Active voice: repository id plus the three repo-relative asset paths.
hf_repo = "litagin/style_bert_vits2_jvnv"
model_file = "jvnv-F2-jp/jvnv-F2_e166_s20000.safetensors"
config_file = "jvnv-F2-jp/config.json"
style_file = "jvnv-F2-jp/style_vectors.npy"

# Echo each asset path, then download it under the local "model_assets" directory.
for asset_name in (model_file, config_file, style_file):
    print(asset_name)
    hf_hub_download(repo_id=hf_repo, filename=asset_name, local_dir="model_assets")

  from .autonotebook import tqdm as notebook_tqdm


jvnv-F2-jp/jvnv-F2_e166_s20000.safetensors
jvnv-F2-jp/config.json
jvnv-F2-jp/style_vectors.npy


In [4]:
# Build the TTS model from the files downloaded above, for a synthesis smoke test.

import torch

from style_bert_vits2.tts_model import TTSModel

assets_root = Path("model_assets")

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA instead of failing here.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = TTSModel(
    model_path=assets_root / model_file,
    config_path=assets_root / config_file,
    style_vec_path=assets_root / style_file,
    device=device,
)

In [5]:
from IPython.display import Audio, display
import numpy as np

# Text to synthesize: one sentence per line, ending with a newline.
text = (
    "こんにちは。\n"
    "今日は元気がなさそうですね。\n"
    "どうかしましたか？\n"
    "私にできることがあれば何でも言ってください。\n"
)
# Assist text handed to infer(); it is enabled through use_assist_text below.
assist_text = "泣きそうな感じでお願いします\n"

# infer() returns an (int, ndarray) pair, unpacked here as rate and waveform.
sr, audio = model.infer(
    text=text,
    assist_text=assist_text,
    use_assist_text=True,
    style="Neutral",
    style_weight=1,
    split_interval=0.3,
)

# Report whether the waveform is a NumPy array, then render an audio player.
print(isinstance(audio, np.ndarray))
display(Audio(audio, rate=sr))

[32m10-22 01:29:13[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
こんにちは。
今日は元気がなさそうですね。
どうかしましたか？
私にできることがあれば何でも言ってください。

[32m10-22 01:29:13[0m |[1m  INFO  [0m| infer.py:24 | Using JP-Extra model


  WeightNorm.apply(module, name, dim)


[32m10-22 01:29:14[0m |[1m  INFO  [0m| safetensors.py:50 | Loaded 'model_assets/jvnv-F2-jp/jvnv-F2_e166_s20000.safetensors' (iteration 166)


  import pkg_resources


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [5]:
# Print the signature and docstring of `model.infer` to review the available
# synthesis parameters and their defaults.
help(model.infer)

Help on method infer in module style_bert_vits2.tts_model:

infer(text: str, language: style_bert_vits2.constants.Languages = <Languages.JP: 'JP'>, speaker_id: int = 0, reference_audio_path: Optional[str] = None, sdp_ratio: float = 0.2, noise: float = 0.6, noise_w: float = 0.8, length: float = 1.0, line_split: bool = True, split_interval: float = 0.5, assist_text: Optional[str] = None, assist_text_weight: float = 1.0, use_assist_text: bool = False, style: str = 'Neutral', style_weight: float = 1.0, given_phone: Optional[list[str]] = None, given_tone: Optional[list[int]] = None, pitch_scale: float = 1.0, intonation_scale: float = 1.0) -> tuple[int, numpy.ndarray[typing.Any, numpy.dtype[typing.Any]]] method of style_bert_vits2.tts_model.TTSModel instance
    テキストから音声を合成する。

    Args:
        text (str): 読み上げるテキスト
        language (Languages, optional): 言語. Defaults to Languages.JP.
        speaker_id (int, optional): 話者 ID. Defaults to 0.
        reference_audio_path (Optional[str], opti