# Prepare filelists for the Uzbek TTS dataset (LJSpeech-style format)


In [1]:
# See: https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md
# Directory that contains the pipe-separated transcript file
# `uzbek_tts_output.csv` read in the "Read dataset" section.
dir_data = "../../../dataset_uz"
# Prefix placed in front of every audio file name (plus ".wav") when the
# transcript file is read; it ends up verbatim in the generated filelists.
audio_dir = "dataset_uz/audio"
# Path handed to `get_hparams_from_file` to load the hyperparameters.
config = "../config.yaml"
# symlink = "DUMMY3"
# Number of shuffled rows written to the validation and test filelists;
# every remaining row goes to the train filelist.
n_val = 100
n_test = 500

## Get hyperparameters from config file


In [2]:
import pandas as pd
from utils.hparams import get_hparams_from_file

# Load the hyperparameters from the file at `config`; later cells read
# `hps.data.text_cleaners` and `hps.data.language` from the result.
hps = get_hparams_from_file(config)

In [3]:
#check espeak

In [4]:
from phonemizer import phonemize

# Smoke test: phonemize a short phrase with the espeak backend and the 'uz'
# voice to confirm that phonemizer and espeak are installed and callable.
phonemes = phonemize('hello world', backend='espeak', language='uz')
print('hello world ->', phonemes)

hello world -> hællɔ (en)dʌbəljuː(uz) o eɹ el de 


## Read dataset

Here `normalized_text` contains numbers in the form of words.

**Note**: you may need to replace all `"|"` with `" | "` in the metadata file (`uzbek_tts_output.csv` in this notebook).


In [5]:
# data = pd.read_csv(
#     f"{dir_data}/uzbek_tts_output.csv",
#     sep=r"|",
#     header=None,
#     names=["file", "text", "normalized_text", "cleaned_text"],
#     index_col=False,
#     # converter to add .wav to file name
#     converters={"file": lambda x: f"{symlink}/{x.strip()}.wav", "text": str.strip, "normalized_text": str.strip},
# )
# data.head(10)

# The symlink needs to be removed (note translated from the original Uzbek
# comment); file paths are built from `audio_dir` instead.
column_names = ["file", "text", "normalized_text", "cleaned_text"]

# Per-column converters applied while parsing:
#   - "file": strip whitespace, prepend `audio_dir` and append ".wav"
#   - "text" / "normalized_text": strip surrounding whitespace
column_converters = {
    "file": lambda stem: f"{audio_dir}/{stem.strip()}.wav",
    "text": str.strip,
    "normalized_text": str.strip,
}

# The transcript file has no header row and uses "|" as the field separator.
data = pd.read_csv(
    f"{dir_data}/uzbek_tts_output.csv",
    sep=r"|",
    header=None,
    names=column_names,
    index_col=False,
    converters=column_converters,
)
data.head(10)

Unnamed: 0,file,text,normalized_text,cleaned_text
0,dataset_uz/audio/utt_0000.wav,U do‘kondan non sotib oldi.,u do‘kondan non sotib oldi.,
1,dataset_uz/audio/utt_0001.wav,Bugun havo juda iliq.,bugun havo juda iliq.,
2,dataset_uz/audio/utt_0002.wav,kVt.s elektr energiya yetkazib bergan.,kilovatt-soniya elektr energiya yetkazib bergan.,
3,dataset_uz/audio/utt_0003.wav,Joriy yilda bu raqam 2 mlrd kVt.soatni tashkil...,joriy yilda bu raqam ikki milliard kilovatt.so...,
4,dataset_uz/audio/utt_0004.wav,"Yangi elektr uzatish liniyasini barpo etish, O...","yangi elektr uzatish liniyasini barpo etish, o...",
5,dataset_uz/audio/utt_0005.wav,Mustaqil Davlatlar Hamdo‘stligi (MDH) missiyas...,mustaqil davlatlar hamdo‘stligi (mdh) missiyas...,
6,dataset_uz/audio/utt_0006.wav,Bu haqda MDH Ijroiya qo‘mitasi raisi Sergey Le...,bu haqda mdh ijroiya qo‘mitasi raisi sergey le...,
7,dataset_uz/audio/utt_0007.wav,"""17 noyabr kuni MDH missiyasi O‘zbekistonda is...",oʻn yettinchi noyabr kuni mdh missiyasi o‘zbek...,
8,dataset_uz/audio/utt_0008.wav,O‘n nafardan ortiq uzoq muddatli va yana 700 n...,o‘n nafardan ortiq uzoq muddatli va yana yetti...,
9,dataset_uz/audio/utt_0009.wav,Ularning deyarli barchasi MDH mamlakatlari vak...,ularning deyarli barchasi mdh mamlakatlari vak...,


## Text cleaners

This step may take a while, so it is better to preprocess the text and save it to a file in advance.

**Note**: `phonemize_text` takes the longest time.


In [6]:
# Get index of tokenize_text
# The configured cleaner list is split at "tokenize_text": that entry and
# everything after it become `token_cleaners`; the entries before it are
# handled separately further down in this cell.
text_cleaners = hps.data.text_cleaners

# NOTE: for a plain list, `.index` raises ValueError when "tokenize_text" is
# not among the configured cleaners.
token_idx = text_cleaners.index("tokenize_text")
token_cleaners = text_cleaners[token_idx:]
print(token_cleaners)


# Extract phonemize_text
def separate_text_cleaners(text_cleaners):
    """Group cleaner names so that "phonemize_text" always stands alone.

    Consecutive cleaners other than "phonemize_text" are collected into one
    group; every "phonemize_text" entry becomes its own single-element group.
    The relative order of the cleaners is preserved.

    Args:
        text_cleaners: iterable of cleaner names.

    Returns:
        A list of lists of cleaner names, e.g.
        ["a", "phonemize_text", "b", "c"] -> [["a"], ["phonemize_text"], ["b", "c"]].
        An empty input yields an empty list.
    """
    groups = []
    pending = []

    for name in text_cleaners:
        if name != "phonemize_text":
            pending.append(name)
            continue

        # Flush whatever was collected before the phonemizer entry.
        if pending:
            groups.append(pending)
            pending = []
        groups.append([name])

    # Trailing cleaners after the last "phonemize_text" (or all of them if
    # there was none).
    if pending:
        groups.append(pending)

    return groups


# Take the cleaners that precede "tokenize_text" and group them, isolating
# "phonemize_text" in a group of its own.
leading_cleaners = text_cleaners[:token_idx]
text_cleaners = separate_text_cleaners(leading_cleaners)
print(text_cleaners)

['tokenize_text', 'add_bos_eos']
[['phonemize_text'], ['add_spaces']]


In [7]:
from text import tokenizer
from torchtext.vocab import Vocab

# Run every cleaner group over the normalized transcripts, in order.
# NOTE(review): the `Vocab` class itself (not an instance) is passed as the
# vocabulary argument — presumably it is unused by these pre-tokenization
# cleaners; confirm against `text.tokenizer`.
text_norm = data["normalized_text"].tolist()
for cleaners in text_cleaners:
    print(f"Cleaning with {cleaners} ...")
    if cleaners[0] != "phonemize_text":
        # Every other group is applied one utterance at a time.
        text_norm = [
            tokenizer(text, Vocab, cleaners, language=hps.data.language)
            for text in text_norm
        ]
        continue

    # The phonemizer group receives the whole list of utterances in one call.
    text_norm = tokenizer(text_norm, Vocab, cleaners, language=hps.data.language)

data = data.assign(cleaned_text=text_norm)
data.head()

Cleaning with ['phonemize_text'] ...
Cleaning with ['add_spaces'] ...


Unnamed: 0,file,text,normalized_text,cleaned_text
0,dataset_uz/audio/utt_0000.wav,U do‘kondan non sotib oldi.,u do‘kondan non sotib oldi.,ʊ <space> d ɔ ʔ k ˈɔ n d a n <space> n ˈɔ n <s...
1,dataset_uz/audio/utt_0001.wav,Bugun havo juda iliq.,bugun havo juda iliq.,b ˈu ɡ ʊ n <space> h ˈa v ɔ <space> j ˈu d a <...
2,dataset_uz/audio/utt_0002.wav,kVt.s elektr energiya yetkazib bergan.,kilovatt-soniya elektr energiya yetkazib bergan.,k i l ˈo v a t t s o n ˈi j a <space> ˈe l ɛ k...
3,dataset_uz/audio/utt_0003.wav,Joriy yilda bu raqam 2 mlrd kVt.soatni tashkil...,joriy yilda bu raqam ikki milliard kilovatt.so...,j ˈo ɹ ɪ j <space> j ˈɪ l d a <space> b ʊ <spa...
4,dataset_uz/audio/utt_0004.wav,"Yangi elektr uzatish liniyasini barpo etish, O...","yangi elektr uzatish liniyasini barpo etish, o...",j ˈa ŋ ɪ <space> ˈe l ɛ k t r <space> u z ˈa t...


## Generate and save vocabulary


In [8]:
from torchtext.vocab import build_vocab_from_iterator
from utils.task import load_vocab, save_vocab
from text.symbols import special_symbols, UNK_ID
from typing import List


def yield_tokens(cleaned_text: List[str]):
    """Lazily yield the whitespace-separated symbols of each cleaned utterance.

    Args:
        cleaned_text: cleaned transcripts, one string per utterance, with
            symbols separated by whitespace.

    Yields:
        One list of symbol strings per utterance, in input order (an empty
        string yields an empty list).
    """
    yield from (utterance.split() for utterance in cleaned_text)


# Build the vocabulary from the whitespace-separated symbols of the cleaned
# transcripts; the special symbols are passed as `specials`, and symbols
# missing from the vocabulary map to UNK_ID.
text_norm = data["cleaned_text"].tolist()
token_stream = yield_tokens(text_norm)
vocab = build_vocab_from_iterator(token_stream, specials=special_symbols)
vocab.set_default_index(UNK_ID)

# Save the vocabulary and immediately load it back, so the `vocab` used from
# here on is the one read from the saved file.
vocab_file = "../vocab_uz.txt"
save_vocab(vocab, vocab_file)
vocab = load_vocab(vocab_file)

print(f"Size of vocabulary: {len(vocab)}")
print(vocab.get_itos())

Size of vocabulary: 90
['<pad>', '<unk>', '<bos>', '<eos>', '<space>', '<laugh>', 'a', 'ɪ', 'l', 'n', 's', 't', 'ˈa', 'd', 'm', 'j', 'k', 'b', 'h', 'r', 'ɹ', 'ˈi', 'ɡ', 'q', 'ˈɪ', 'ɔ', 'ˈɔ', 'z', 'ʔ', 'i', 'ˌa', 'v', '.', 'tʃ', 'o', 'ʊ', 'ŋ', 'ˈo', 'ˈʊ', ',', 'ˌɔ', 'ˌi', 'χ', 'p', 'f', 'ˌu', 'u', 'ˌo', 'e', 'ˈe', 'ˈu', 'ˈɛ', 'ˌɪ', 'ɛ', 'ˌe', 'ˈæ', 'æ', 'ˌʊ', 'ˌɛ', '“', '”', 'ˌæ', 'ts', ':', '(en)', '(uz)', '?', '!', 'ə', 'ˈʌ', 'əl', 'ˌuː', 'ʃ', ';', 'ɐ', '"', 'ˈy', 'x', 'ɯ', 'ˈaɪ', 'ˈɔː', '—', '…', 'oː', 'ɔː', 'c', 'ɟ', 'ˈeɪ', 'ˌɒ', 'ˌɔː']


## Token cleaners

In [9]:
from text import detokenizer

# Apply the token-level cleaners ("tokenize_text" onwards) to every cleaned
# transcript, using the vocabulary built above.
token_rows = []
for text in data["cleaned_text"].tolist():
    temp = tokenizer(text, vocab, token_cleaners, language=hps.data.language)
    # Stop at the first utterance that contains a symbol the vocabulary does
    # not know; the message shows the input and the detokenized result.
    assert UNK_ID not in temp, f"Found unknown symbol:\n{text}\n{detokenizer(temp)}"
    token_rows.append(temp)

# Serialize each id sequence as a tab-separated string for the filelists.
text_norm = ["\t".join(str(token) for token in row) for row in token_rows]
data = data.assign(tokens=text_norm)
data.head()

Unnamed: 0,file,text,normalized_text,cleaned_text,tokens
0,dataset_uz/audio/utt_0000.wav,U do‘kondan non sotib oldi.,u do‘kondan non sotib oldi.,ʊ <space> d ɔ ʔ k ˈɔ n d a n <space> n ˈɔ n <s...,2\t35\t4\t13\t25\t28\t16\t26\t9\t13\t6\t9\t4\t...
1,dataset_uz/audio/utt_0001.wav,Bugun havo juda iliq.,bugun havo juda iliq.,b ˈu ɡ ʊ n <space> h ˈa v ɔ <space> j ˈu d a <...,2\t17\t50\t22\t35\t9\t4\t18\t12\t31\t25\t4\t15...
2,dataset_uz/audio/utt_0002.wav,kVt.s elektr energiya yetkazib bergan.,kilovatt-soniya elektr energiya yetkazib bergan.,k i l ˈo v a t t s o n ˈi j a <space> ˈe l ɛ k...,2\t16\t29\t8\t37\t31\t6\t11\t11\t10\t34\t9\t21...
3,dataset_uz/audio/utt_0003.wav,Joriy yilda bu raqam 2 mlrd kVt.soatni tashkil...,joriy yilda bu raqam ikki milliard kilovatt.so...,j ˈo ɹ ɪ j <space> j ˈɪ l d a <space> b ʊ <spa...,2\t15\t37\t20\t7\t15\t4\t15\t24\t8\t13\t6\t4\t...
4,dataset_uz/audio/utt_0004.wav,"Yangi elektr uzatish liniyasini barpo etish, O...","yangi elektr uzatish liniyasini barpo etish, o...",j ˈa ŋ ɪ <space> ˈe l ɛ k t r <space> u z ˈa t...,2\t15\t12\t36\t7\t4\t49\t8\t53\t16\t11\t19\t4\...


## Save train, val, test filelists


In [10]:
# Keep only what the filelists need: the audio path and the tab-separated
# token ids.
data = data[["file", "tokens"]]

# Shuffle with a fixed seed so that re-running the notebook reproduces the
# same train/val/test membership. An unseeded shuffle moves utterances
# between the splits on every run, which makes validation/test results
# incomparable across runs.
split_seed = 1234
data = data.sample(frac=1, random_state=split_seed).reset_index(drop=True)

# First `n_val` rows -> validation, next `n_test` rows -> test, rest -> train.
data_val = data.iloc[:n_val]
data_test = data.iloc[n_val: n_val + n_test]
data_train = data.iloc[n_val + n_test:]

# Each filelist is written as headerless "file|tokens" lines.
for split_name, split_df in (("train", data_train), ("val", data_val), ("test", data_test)):
    split_df.to_csv(f"../filelists/{split_name}.txt", sep="|", index=False, header=False)