In [80]:
import json
import re
from pathlib import Path
import pandas as pd

In [81]:
# Opt in to pandas' upcoming behaviour in which fillna() and similar methods no
# longer silently downcast object-dtype results; this keeps the dtypes produced
# by the fillna() call later in this notebook stable across pandas versions.
pd.set_option("future.no_silent_downcasting", True)

In [82]:
def load_json_with_comments(file_path):
    """Load a JSON file that may contain ``//`` or ``/* ... */`` comments.

    Comments are removed before the text is handed to ``json.loads``.
    Comment markers that appear *inside* string values (for example the
    ``//`` in ``"http://host/path"`` or ``"a //b"``) are left untouched.

    Args:
        file_path: Path (``str`` or ``os.PathLike``) of the file to read.
            The file is decoded as UTF-8.

    Returns:
        The parsed JSON document (usually a ``dict``).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the text is not valid JSON once the
            comments have been removed.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    # String literals are matched first so that comment markers inside them
    # are consumed as part of the string and never mistaken for a comment.
    token_pattern = re.compile(
        r'"(?:\\.|[^"\\])*"'  # complete JSON string literal (with escapes)
        r"|//[^\n]*"          # line comment, up to the end of the line
        r"|/\*.*?\*/",        # block comment, possibly spanning lines
        re.DOTALL,
    )

    def _keep_strings(match):
        # Keep string literals verbatim; replace comments with nothing.
        token = match.group(0)
        return token if token.startswith('"') else ""

    return json.loads(token_pattern.sub(_keep_strings, content))

In [83]:
# Directory scanned for models; every sub-directory is treated as one model.
# Built from the current user's home directory rather than a hard-coded
# /home/<user> prefix so the notebook is not tied to a single account.
models_path = Path.home() / ".local" / "share" / "tts"
model_dirs = sorted(d for d in models_path.iterdir() if d.is_dir())

# One record per model: the directory name plus the "audio" section of its
# config.json (flattened into top-level keys).
model_info_list = []
for model_dir in model_dirs:
    config_path = model_dir / "config.json"
    if not config_path.is_file():
        # A sub-directory without a config.json cannot contribute any
        # settings; skip it instead of aborting the whole scan.
        continue
    config = load_json_with_comments(config_path)
    model_info = {
        "model_name": model_dir.name,
        # `or {}` also covers configs where "audio" is present but null.
        **(config.get("audio") or {}),
    }
    model_info_list.append(model_info)

models_df = pd.DataFrame(model_info_list)

In [84]:
# Fallback values used to fill missing per-model audio settings before the
# models are compared (see the fillna() call in simplify_models_df).
# NOTE(review): presumably these mirror the TTS library's own audio-config
# defaults - confirm against the library version in use.
# Keys that are commented out are listed with None, i.e. there is no concrete
# value to fill in for them; they are kept only as a record of those settings.
default_values = {
    "fft_size": 1024,
    "win_length": 1024,
    "hop_length": 256,
    # "frame_shift_ms": None,
    # "frame_length_ms": None,
    "stft_pad_mode": "reflect",
    "sample_rate": 22050,
    "resample": False,
    "preemphasis": 0.0,
    "ref_level_db": 20,
    "do_sound_norm": False,
    "log_func": "np.log10",
    "do_trim_silence": True,
    "trim_db": 45,
    "do_rms_norm": False,
    # "db_level": None,
    "power": 1.5,
    "griffin_lim_iters": 60,
    "num_mels": 80,
    "mel_fmin": 0.0,
    # "mel_fmax": None,
    "spec_gain": 20,
    "do_amp_to_db_linear": True,
    "do_amp_to_db_mel": True,
    "pitch_fmax": 640.0,
    "pitch_fmin": 1.0,
    "signal_norm": True,
    "min_level_db": -100,
    "symmetric_norm": True,
    "max_norm": 4.0,
    "clip_norm": True,
    # "stats_path": None,
}

# Columns removed from the comparison table regardless of their values.
columns_to_drop = ["stats_path", "do_trim_silence", "trim_db"]
# Models excluded from the comparison; entries are matched against the
# "model_name" column, which holds each model's directory name.
models_to_drop = [
    "tts_models--en--ljspeech--glow-tts",
    "tts_models--en--ljspeech--vits--neon",
    "vocoder_models--be--common-voice--hifigan",
    "vocoder_models--en--blizzard2013--hifigan_v2",
    "vocoder_models--en--ek1--wavegrad",
    "vocoder_models--en--librispeech100--wavlm-hifigan",
    "vocoder_models--en--librispeech100--wavlm-hifigan_prematched",
    "vocoder_models--en--ljspeech--univnet",
    "vocoder_models--en--sam--hifigan_v2",
    "vocoder_models--nl--mai--parallel-wavegan",
    "vocoder_models--tr--common-voice--hifigan",
    "vocoder_models--uk--mai--multiband-melgan",
    "vocoder_models--universal--libri-tts--wavegrad",
]

def simplify_models_df(df, defaults=None, drop_columns=None, drop_models=None):
    """Reduce a model/settings table to the settings that actually differ.

    Steps, in order:
      1. Remove the rows whose ``model_name`` is listed in ``drop_models``.
      2. Fill missing values in every column that has an entry in
         ``defaults``.
      3. Remove every column listed in ``drop_columns`` and every column
         that holds a single distinct value (NaN counts as a value) across
         the remaining rows.

    The input frame is not modified.

    Args:
        df: DataFrame with a ``model_name`` column and one column per setting.
        defaults: Mapping of column name -> fill value. Defaults to the
            module-level ``default_values``.
        drop_columns: Column names to remove unconditionally. Defaults to the
            module-level ``columns_to_drop``.
        drop_models: ``model_name`` values whose rows are removed. Defaults
            to the module-level ``models_to_drop``.

    Returns:
        A new DataFrame containing the remaining rows (original index labels
        preserved) and only the columns that vary between them.
    """
    # Fall back to the notebook-level settings only when nothing was passed,
    # so existing single-argument calls keep their behaviour.
    defaults = default_values if defaults is None else defaults
    drop_columns = columns_to_drop if drop_columns is None else drop_columns
    drop_models = models_to_drop if drop_models is None else drop_models

    # Filter first, then copy: only the rows that are kept get duplicated, and
    # the column assignments below operate on an explicitly independent frame.
    df = df[~df["model_name"].isin(drop_models)].copy()

    redundant = []
    for col in df.columns:
        if col in drop_columns:
            redundant.append(col)
            continue
        if col in defaults:
            df[col] = df[col].fillna(defaults[col])
        # dropna=False: a column that is entirely NaN also counts as constant.
        if df[col].nunique(dropna=False) == 1:
            redundant.append(col)

    # Drop everything in one go instead of rebuilding the frame per column.
    return df.drop(columns=redundant)

In [85]:
# (rows, columns) of the raw table: one row per scanned model directory, one
# column for model_name plus one per distinct key found in the "audio" sections.
models_df.shape

(23, 33)

In [86]:
# Show the reduced table: excluded models removed, defaults filled in, and only
# the settings that still differ between the remaining models kept as columns.
simplify_models_df(models_df)

Unnamed: 0,model_name,sample_rate,preemphasis,ref_level_db,log_func,mel_fmin,mel_fmax,spec_gain,signal_norm,pitch_fmin
0,tts_models--en--ljspeech--fast_pitch,22050,0.0,20.0,np.log,0.0,8000.0,1.0,False,1.0
2,tts_models--en--ljspeech--tacotron2-DCA,22050,0.0,20.0,np.log10,50.0,7600.0,1.0,True,0.0
3,tts_models--en--ljspeech--tacotron2-DDC,22050,0.0,20.0,np.log,0.0,8000.0,1.0,False,1.0
4,tts_models--en--ljspeech--vits,22050,0.0,20.0,np.log10,0.0,,20.0,True,1.0
6,tts_models--en--vctk--vits,22050,0.0,20.0,np.log10,0.0,,20.0,True,1.0
7,tts_models--lt--cv--vits,22050,0.0,20.0,np.log10,0.0,,20.0,True,1.0
13,vocoder_models--en--ljspeech--hifigan_v2,22050,0.0,20.0,np.log,0.0,8000.0,1.0,False,1.0
14,vocoder_models--en--ljspeech--multiband-melgan,22050,0.0,0.0,np.log10,50.0,7600.0,1.0,True,0.0
17,vocoder_models--en--vctk--hifigan_v2,22050,0.98,20.0,np.log10,0.0,8000.0,20.0,True,1.0
21,vocoder_models--universal--libri-tts--fullband...,24000,0.0,0.0,np.log10,50.0,7600.0,1.0,True,0.0
