# Prepare filelists for LJSpeech dataset


## Get hyperparameters from config file


In [4]:
import os
import sys

root_path = os.getcwd().split('/')[:-3]
vits_path = '/'.join(root_path)
utils_path = vits_path + '/utils'
sys.path.append(vits_path)
sys.path.append(utils_path)

In [6]:
import pandas as pd
import sys
print(sys.path)
from hparams import get_hparams_from_file
# See: https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md
dir_data = vits_path + '/LJSpeech-1.1'
config = "../config.yaml"
symlink = "DUMMY1"
n_val = 100
n_test = 500

hps = get_hparams_from_file(config)

['/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/home/aquintero/vits2/venv/lib/python3.10/site-packages', '/home/aquintero/vits2', '/home/aquintero/vits2', '/home/aquintero/vits2', '/home/aquintero/vits2/utils']


In [7]:
hps

{'train': {'log_interval': 100, 'eval_interval': 1000, 'seed': 1234, 'epochs': 20000, 'learning_rate': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'batch_size': 64, 'fp16_run': True, 'lr_decay': 0.999875, 'segment_size': 8192, 'init_lr_ratio': 1, 'warmup_epochs': 0, 'c_mel': 45, 'c_kl_text': 0, 'c_kl_dur': 2, 'c_kl_audio': 0.05}, 'data': {'training_files': 'datasets/ljs_base/filelists/train.txt', 'validation_files': 'datasets/ljs_base/filelists/val.txt', 'vocab_file': 'datasets/ljs_base/vocab.txt', 'text_cleaners': ['phonemize_text', 'add_spaces', 'tokenize_text', 'add_bos_eos'], 'cleaned_text': True, 'language': 'en-us', 'bits_per_sample': 16, 'sample_rate': 22050, 'n_fft': 2048, 'hop_length': 256, 'win_length': 1024, 'n_mels': 80, 'f_min': 0, 'f_max': None, 'n_speakers': 0, 'use_mel': True}, 'model': {'inter_channels': 192, 'hidden_channels': 192, 'filter_channels': 768, 'n_heads': 2, 'n_layers': 6, 'n_layers_q': 12, 'n_flows': 8, 'kernel_size': 3, 'p_dropout': 0.1, 'speaker_cond_lay

## Read dataset

Here `normalized_text` contains numbers in the form of words.

**Note**: you may need to replace all `"|"` with `" | "` in the file `metadata.csv`.


In [9]:
data = pd.read_csv(
    f"{dir_data}/metadata_copy.csv",
    sep=r"|",
    header=None,
    names=["file", "text", "normalized_text", "cleaned_text"],
    index_col=False,
    # converter to add .wav to file name
    converters={"file": lambda x: f"{symlink}/{x.strip()}.wav", "text": str.strip, "normalized_text": str.strip},
)
data.head()

Unnamed: 0,file,text,normalized_text,cleaned_text
0,DUMMY1/LJ001-0001.wav,"Printing, in the only sense with which we are ...","Printing, in the only sense with which we are ...",
1,DUMMY1/LJ001-0002.wav,in being comparatively modern.,in being comparatively modern.,
2,DUMMY1/LJ001-0003.wav,For although the Chinese took impressions from...,For although the Chinese took impressions from...,
3,DUMMY1/LJ001-0004.wav,"produced the block books, which were the immed...","produced the block books, which were the immed...",
4,DUMMY1/LJ001-0005.wav,the invention of movable metal letters in the ...,the invention of movable metal letters in the ...,


## Text cleaners

It may take a while, so better to preprocess the text and save it to a file in advance.

**Note** `phonemize_text` takes the longest time.`


In [10]:
# Get index of tokenize_text
text_cleaners = hps.data.text_cleaners

token_idx = text_cleaners.index("tokenize_text")
token_cleaners = text_cleaners[token_idx:]
print(token_cleaners)


# Extract phonemize_text
def separate_text_cleaners(text_cleaners):
    final_list = []
    temp_list = []

    for cleaner in text_cleaners:
        if cleaner == "phonemize_text":
            if temp_list:
                final_list.append(temp_list)
            final_list.append([cleaner])
            temp_list = []
        else:
            temp_list.append(cleaner)

    if temp_list:
        final_list.append(temp_list)

    return final_list


text_cleaners = text_cleaners[:token_idx]
text_cleaners = separate_text_cleaners(text_cleaners)
print(text_cleaners)

['tokenize_text', 'add_bos_eos']
[['phonemize_text'], ['add_spaces']]


In [1]:
from text import tokenizer
from torchtext.vocab import Vocab

text_norm = data["normalized_text"].tolist()
for cleaners in text_cleaners:
    print(f"Cleaning with {cleaners} ...")
    if cleaners[0] == "phonemize_text":
        text_norm = tokenizer(text_norm, Vocab, cleaners, language=hps.data.language)
    else:
        for idx, text in enumerate(text_norm):
            temp = tokenizer(text, Vocab, cleaners, language=hps.data.language)
            text_norm[idx] = temp

data = data.assign(cleaned_text=text_norm)
data.head()

ModuleNotFoundError: No module named 'text'

## Generate and save vocabulary


In [12]:
from torchtext.vocab import build_vocab_from_iterator
from utils.task import load_vocab, save_vocab
from text.symbols import special_symbols, UNK_ID
from typing import List


def yield_tokens(cleaned_text: List[str]):
    for text in cleaned_text:
        yield text.split()


text_norm = data["cleaned_text"].tolist()
vocab = build_vocab_from_iterator(yield_tokens(text_norm), specials=special_symbols)
vocab.set_default_index(UNK_ID)

vocab_file = f"../vocab.txt"
save_vocab(vocab, vocab_file)

vocab = load_vocab(vocab_file)
print(f"Size of vocabulary: {len(vocab)}")
print(vocab.get_itos())

AttributeError: 'float' object has no attribute 'split'

## Token cleaners


In [7]:
from text import detokenizer

text_norm = data["cleaned_text"].tolist()
for idx, text in enumerate(text_norm):
    temp = tokenizer(text, vocab, token_cleaners, language=hps.data.language)
    assert UNK_ID not in temp, f"Found unknown symbol:\n{text}\n{detokenizer(temp)}"
    text_norm[idx] = temp

text_norm = ["\t".join(map(str, text)) for text in text_norm]
data = data.assign(tokens=text_norm)
data.head()

Unnamed: 0,file,text,normalized_text,cleaned_text,tokens
0,DUMMY1/LJ001-0001.wav,"Printing, in the only sense with which we are ...","Printing, in the only sense with which we are ...","p ɹ ˈɪ n t ɪ ŋ , <space> ˈɪ n <space> ð ə <spa...",2\t19\t12\t18\t6\t7\t15\t42\t27\t4\t18\t6\t4\t...
1,DUMMY1/LJ001-0002.wav,in being comparatively modern.,in being comparatively modern.,ˈɪ n <space> b ˈiː ɪ ŋ <space> k ə m p ˈæ ɹ ə ...,2\t18\t6\t4\t25\t36\t15\t42\t4\t13\t8\t17\t19\...
2,DUMMY1/LJ001-0003.wav,For although the Chinese took impressions from...,For although the Chinese took impressions from...,f ɔːɹ <space> ɔː l ð ˈoʊ <space> ð ə <space> t...,2\t23\t59\t4\t92\t16\t11\t39\t4\t11\t8\t4\t50\...
3,DUMMY1/LJ001-0004.wav,"produced the block books, which were the immed...","produced the block books, which were the immed...",p ɹ ə d ˈuː s t <space> ð ə <space> b l ˈɑː k ...,2\t19\t12\t8\t10\t44\t9\t7\t4\t11\t8\t4\t25\t1...
4,DUMMY1/LJ001-0005.wav,the invention of movable metal letters in the ...,the invention of movable metal letters in the ...,ð ə <space> ɪ n v ˈɛ n ʃ ə n <space> ʌ v <spac...,2\t11\t8\t4\t15\t6\t21\t22\t6\t37\t8\t6\t4\t28...


## Save train, val, test filelists


In [8]:
data = data[["file", "tokens"]]
data = data.sample(frac=1).reset_index(drop=True)

data_train = data.iloc[n_val + n_test:]
data_val = data.iloc[:n_val]
data_test = data.iloc[n_val: n_val + n_test]

data_train.to_csv("../filelists/train.txt", sep="|", index=False, header=False)
data_val.to_csv("../filelists/val.txt", sep="|", index=False, header=False)
data_test.to_csv("../filelists/test.txt", sep="|", index=False, header=False)