In [23]:
import os
import librosa
from tqdm import tqdm
import torch
import numpy as np
import codecs
from utils.text2seq import text2seq
from layers import TacotronSTFT
import hparams as hp

# Data preparation parameters
# ---------------------------
# Set in this cell : csv_file, root_dir, data_dir
# Set in hparams   : sampling_rate
# Text front-end   : text2seq (phonemizer)

# "|"-separated metadata file; it is read line by line further down.
csv_file = '-DEV/datasets/TITML-IDN-F01-22kHz/metadata.csv'
# Directory holding the input audio; '<fname>.wav' is looked up in here.
root_dir = '-DEV/datasets/TITML-IDN-F01-22kHz/wavs'
# Output directory for the preprocessed .npy files.
data_dir = '-DEV/datasets/TITML-IDN-F01-22kHz/preprocess_melTAC_phonEN'

os.makedirs(data_dir, exist_ok=True)

# A single TacotronSTFT instance, reused by get_mel for every file.
stft = TacotronSTFT()
def get_mel(filename):
    """Load a wav file, trim its quiet edges and compute a mel spectrogram.

    The audio is loaded at ``hp.sampling_rate``, cut down to the span between
    the first and last sample whose absolute amplitude exceeds 5% of the peak,
    right-padded with 50 ms of zeros, and passed to ``stft.mel_spectrogram``.

    If no sample exceeds the threshold (empty or all-zero audio) the signal is
    left untrimmed instead of failing.

    Args:
        filename: Path of the audio file to load.

    Returns:
        A tuple ``(melspec, wav)``: the output of ``stft.mel_spectrogram`` with
        its leading dimension squeezed out, and the trimmed, padded waveform
        as a 1-D float tensor.
    """
    wav, _ = librosa.load(filename, sr=hp.sampling_rate)
    wav = torch.FloatTensor(wav.astype(np.float32))

    ### trimming ###
    # Default to the whole signal so start/end are always defined; the
    # original swallowed the lookup error and then hit a NameError below.
    start, end = 0, wav.shape[0]
    if wav.numel() > 0:
        amplitude = torch.abs(wav)
        loud = torch.where(amplitude > amplitude.max() * 0.05)[0]
        if loud.numel() > 0:
            # NOTE: `end` is used as an exclusive bound, so the last
            # above-threshold sample itself is dropped. Kept as-is so the
            # features match ones produced by earlier runs.
            start, end = loud[0].item(), loud[-1].item()

    ### 50ms silence padding ###
    wav = torch.nn.functional.pad(wav[start:end], (0, hp.sampling_rate // 20))
    ### Wav -> Mel conversion ###
    melspec = stft.mel_spectrogram(wav.unsqueeze(0))

    return melspec.squeeze(0), wav


# Create the output tree: one sub-directory per saved feature type.
# makedirs(exist_ok=True) also creates data_dir itself when missing and avoids
# the check-then-create race of `if not exists: mkdir`.
for subdir in ('phone_seq', 'melspectrogram'):
    os.makedirs(f'{data_dir}/{subdir}', exist_ok=True)


# Read the metadata up front so the file handle is closed before the slow
# per-utterance work starts; tqdm still gets a list and can show a total.
with codecs.open(csv_file, 'r', 'utf-8') as f:
    metadata_lines = f.readlines()

for line_no, line in enumerate(tqdm(metadata_lines), start=1):
    # A blank line (e.g. an extra newline at the end of the file) is not an
    # utterance; skip it instead of failing on the unpack below.
    if not line.strip():
        continue

    fields = line.split("|")
    if len(fields) != 3:
        raise ValueError(
            f'{csv_file}:{line_no}: expected 3 "|"-separated fields, '
            f'got {len(fields)}'
        )
    # The middle field is not used by this preprocessing step.
    fname, _, text = fields

    # NOTE(review): `text` still ends with the line terminator, exactly as
    # before; confirm text2seq is meant to receive it.
    wav_name = os.path.join(root_dir, fname) + '.wav'
    phone_seq = text2seq(text)
    melspec, _ = get_mel(wav_name)

    np.save(f'{data_dir}/phone_seq/{fname}_sequence.npy', phone_seq)
    np.save(f'{data_dir}/melspectrogram/{fname}_melspectrogram.npy', melspec.numpy())

print("FINISH DATA PREPROCESSING!!!")

100%|██████████| 331/331 [01:04<00:00,  5.10it/s]

FINISH DATA PREPROCESSING!!!





### Split Train and Val

In [26]:
# Load the "|"-separated metadata as a 2-D array of strings, one row per line.
# comments=None: by default np.loadtxt treats '#' as the start of a comment and
#   would cut off any transcript that contains it.
# encoding='utf-8': the same encoding the preprocessing cell uses for this file.
metadata = np.loadtxt(csv_file, delimiter='|', dtype=str, comments=None, encoding='utf-8')

In [33]:
# Hold out a fixed number of utterances for validation; the rest are for training.
num_val = 15
num_train = len(metadata) - num_val
(num_train, num_val)

(316, 15)

In [52]:
# Shuffle the row indices with a fixed seed so the train/val split is the same
# on every run (the original drew from the unseeded global RNG).
SPLIT_SEED = 1234
indices = np.random.RandomState(SPLIT_SEED).permutation(len(metadata))

In [55]:
# The first num_train shuffled indices go to training, the remainder to validation.
train_indices, val_indices = indices[:num_train], indices[num_train:]
(len(train_indices), len(val_indices))

(316, 15)

In [57]:
# Pick the metadata rows that belong to each split (all columns are kept).
train_files = metadata[train_indices]
val_files = metadata[val_indices]

In [63]:
# Write each split back out in the same "|"-separated layout as the metadata.
# The target directory is created first so a fresh checkout does not fail.
# encoding='utf-8' keeps the output consistent with how the metadata is read;
# np.savetxt otherwise defaults to latin1.
os.makedirs('filelists', exist_ok=True)
np.savetxt('filelists/TITML-IDN-F01-trainfiles.txt', train_files, fmt='%s', delimiter='|', encoding='utf-8')
np.savetxt('filelists/TITML-IDN-F01-valfiles.txt', val_files, fmt='%s', delimiter='|', encoding='utf-8')