In [1]:
import os
import json

class HParams:
    """Attribute-style container for (possibly nested) hyperparameters.

    Every keyword argument becomes an instance attribute. Values that are
    dictionaries are converted recursively into ``HParams`` instances, so a
    nested config can be read as ``hps.model.some_option``.

    Besides attribute access, the object supports the part of the mapping
    protocol that is convenient for configs: ``keys()``, ``items()``,
    ``values()``, ``hps[key]``, ``hps[key] = value``, ``key in hps``,
    iteration over the keys, and ``**hps`` unpacking.

    ``__len__`` is intentionally not defined so that an empty instance stays
    truthy, exactly as before.
    """

    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            if isinstance(value, dict):
                # Recursively turn dictionaries into HParams
                value = HParams(**value)
            setattr(self, key, value)

    def keys(self):
        """Return a view of the stored hyperparameter names."""
        return self.__dict__.keys()

    def items(self):
        """Return a view of the stored ``(name, value)`` pairs."""
        return self.__dict__.items()

    def values(self):
        """Return a view of the stored hyperparameter values."""
        return self.__dict__.values()

    def __iter__(self):
        # Without this, iteration falls back to __getitem__(0), which fails
        # because attribute names must be strings.
        return iter(self.__dict__)

    def __contains__(self, key):
        # Without this, `key in hps` uses the same broken integer fallback.
        return key in self.__dict__

    def __getitem__(self, key):
        """Return the value stored under ``key`` (same as attribute access)."""
        return getattr(self, key)

    def __setitem__(self, key, value):
        """Store ``value`` under ``key`` (same as attribute assignment)."""
        setattr(self, key, value)

    def __repr__(self):
        return f"{type(self).__name__}({self.__dict__!r})"


def get_hparams(init=True, config_path="./configs/vits2_ljs_ring.json", model_name="test"):
    """Load a JSON config into an ``HParams`` object bound to a model directory.

    The model directory is ``./logs/<model_name>``; it is created if missing.

    Args:
        init: When true, read ``config_path`` and store a copy of its text as
            ``config.json`` inside the model directory. When false, read the
            previously stored ``config.json`` instead.
        config_path: Path of the JSON config that is read when ``init`` is true.
        model_name: Name of the sub-directory of ``./logs`` for this model.

    Returns:
        An ``HParams`` built from the config, with ``model_dir`` set and with
        empty ``model`` / ``data`` sections added when the config lacks them.

    Raises:
        OSError: If the config cannot be read or the copy cannot be written.
        json.JSONDecodeError: If the config text is not valid JSON.
        TypeError: If the top-level JSON value is not an object.
    """
    # Model directory setup; exist_ok avoids the check-then-create race
    model_dir = os.path.join("./logs", model_name)
    os.makedirs(model_dir, exist_ok=True)

    config_save_path = os.path.join(model_dir, "config.json")

    # init=True reads the user-supplied config, otherwise the saved copy.
    # JSON text is read as UTF-8 instead of the platform default encoding.
    source_path = config_path if init else config_save_path
    with open(source_path, "r", encoding="utf-8") as f:
        data = f.read()

    # Parse and build before writing, so a malformed config is rejected
    # without overwriting a previously saved config.json
    config = json.loads(data)

    # Create HParams object
    hparams = HParams(**config)
    hparams.model_dir = model_dir

    # Ensure default values for model and data
    if not hasattr(hparams, "model"):
        hparams.model = HParams()
    if not hasattr(hparams, "data"):
        hparams.data = HParams()

    if init:
        # Save the configuration text unchanged inside the model directory
        with open(config_save_path, "w", encoding="utf-8") as f:
            f.write(data)

    return hparams


# Notebook driver: pick the config file and the run name, then build `hps`.
config_path = "./configs/vits2_ljs_ring.json"  # JSON config to load
model_name = "test"  # run name handed to get_hparams

# init=True re-reads config_path and stores a copy for this run.
hps = get_hparams(
    init=True,
    config_path=config_path,
    model_name=model_name,
)

In [3]:
import utils
import torch
import itertools
from text.symbols import symbols
from torch.nn.parallel import DistributedDataParallel as DDP
from models import (
    SynthesizerTrn,
    MultiPeriodDiscriminator,
    MultiScaleSubbandCQTDiscriminator,
    AVAILABLE_FLOW_TYPES,
)
# Choose the posterior encoder input. The flag is optional in the model
# config: a missing key behaves like False. The `== True` comparison is kept
# on purpose so only a value that compares equal to True enables the feature.
if getattr(hps.model, "use_mel_posterior_encoder", False) == True:
    print("Using mel posterior encoder for VITS2")
    # NOTE(review): 80 is presumably the mel channel count - confirm against the data config.
    posterior_channels = 80  # vits2
    hps.data.use_mel_posterior_encoder = True
else:
    print("Using lin posterior encoder for VITS1")
    # Channel count derived from filter_length: filter_length // 2 + 1.
    posterior_channels = hps.data.filter_length // 2 + 1
    hps.data.use_mel_posterior_encoder = False
# Choose the flow variant. A missing `use_transformer_flows` key behaves like
# False; `== True` is kept so only a value equal to True enables it.
if getattr(hps.model, "use_transformer_flows", False) == True:
    use_transformer_flows = True
    transformer_flow_type = hps.model.transformer_flow_type
    print(f"Using transformer flows {transformer_flow_type} for VITS2")
    # The configured type must be one the models module declares as available.
    assert transformer_flow_type in AVAILABLE_FLOW_TYPES, (
        f"transformer_flow_type must be one of {AVAILABLE_FLOW_TYPES}"
    )
else:
    print("Using normal flows for VITS1")
    use_transformer_flows = False

# Speaker-conditioned encoder handling. A missing key behaves like False;
# `== True` is kept so only a value equal to True takes the first branch.
if getattr(hps.model, "use_spk_conditioned_encoder", False) == True:
    if hps.data.n_speakers == 0:
        print("Warning: use_spk_conditioned_encoder is True but n_speakers is 0")
    print(
        "Setting use_spk_conditioned_encoder to False as model is a single speaker model"
    )
else:
    print("Using normal encoder for VITS1")

# Both paths end with the flag disabled.
# NOTE(review): the "single speaker" message is printed whenever the config
# flag is on, regardless of n_speakers - confirm that this is intended.
use_spk_conditioned_encoder = False

# Noise-scaled MAS settings. A missing `use_noise_scaled_mas` key behaves like
# False; `== True` is kept so only a value equal to True enables it.
if getattr(hps.model, "use_noise_scaled_mas", False) == True:
    print("Using noise scaled MAS for VITS2")
    use_noise_scaled_mas = True
    # Initial noise scale and the delta value passed to the generator below.
    mas_noise_scale_initial, noise_scale_delta = 0.01, 2e-6
else:
    print("Using normal MAS for VITS1")
    use_noise_scaled_mas = False
    # Zero for both values when the feature is off.
    mas_noise_scale_initial, noise_scale_delta = 0.0, 0.0

# Generator. Positional arguments: number of text symbols, posterior channel
# count, and segment_size // hop_length (presumably the segment length in
# frames - confirm against SynthesizerTrn). Every entry of hps.model is
# forwarded as a keyword argument.
net_g = SynthesizerTrn(
    len(symbols),
    posterior_channels,
    hps.train.segment_size // hps.data.hop_length,
    mas_noise_scale_initial=mas_noise_scale_initial,
    noise_scale_delta=noise_scale_delta,
    **hps.model,
)
net_g = net_g.cuda(0)

# Discriminators, placed on the same CUDA device (index 0).
net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(0)
net_cqtd = MultiScaleSubbandCQTDiscriminator(hps).cuda(0)

# Both optimizers share the same AdamW settings from the training config.
adamw_options = {
    "lr": hps.train.learning_rate,
    "betas": hps.train.betas,
    "eps": hps.train.eps,
}
optim_g = torch.optim.AdamW(net_g.parameters(), **adamw_options)
# A single optimizer updates the parameters of both discriminators.
optim_d = torch.optim.AdamW(
    itertools.chain(net_d.parameters(), net_cqtd.parameters()),
    **adamw_options,
)

# Load the newest "G_*.pth" checkpoint found in the model directory into the
# generator, passing its optimizer along. This stays best-effort: a missing or
# unreadable checkpoint does not abort the notebook, but the cause is printed
# so the failure is not silent.
try:
    _, _, _, epoch_str = utils.load_checkpoint(
        utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g
    )
except Exception as err:  # not a bare except: KeyboardInterrupt/SystemExit still propagate
    print(f"error: could not load generator checkpoint: {err!r}")
    # Keep the name defined for later cells; 1 stands for "no checkpoint resumed".
    # NOTE(review): net_g keeps its freshly initialised weights in this case.
    epoch_str = 1

Using mel posterior encoder for VITS2
Using transformer flows pre_conv for VITS2
Using normal encoder for VITS1
Using noise scaled MAS for VITS2
Low pass filter created, time used = 0.0007 seconds
num_octave =  9
No early downsampling is required, downsample_factor =  1
Early downsampling filter created,                         time used = 0.0000 seconds
CQT kernels created, time used = 0.0023 seconds
Low pass filter created, time used = 0.0002 seconds
num_octave =  9
No early downsampling is required, downsample_factor =  1
Early downsampling filter created,                         time used = 0.0000 seconds
CQT kernels created, time used = 0.0038 seconds
Low pass filter created, time used = 0.0002 seconds
num_octave =  9
No early downsampling is required, downsample_factor =  1
Early downsampling filter created,                         time used = 0.0000 seconds
CQT kernels created, time used = 0.0053 seconds
./logs/test/G_352000.pth
INFO:root:Loaded checkpoint './logs/test/G_352000.