In [1]:
# Notebook setup: silence warnings, import third-party libraries, and extend
# sys.path so that helper modules living outside this directory can be imported.
# The order matters: each sys.path.append must run before the imports below it.
import warnings
warnings.filterwarnings("ignore")  # suppresses every warning category for the whole session
import torch
import sys

import os
import numpy as np
from tqdm import tqdm
import torchaudio
import librosa
import glob

# Helper modules resolved through a relative path outside this repository.
# NOTE(review): cuda2numpy, cuda2cpu, plot_spectrogram and play_audio are not
# used by the cells visible in this notebook — confirm before removing.
sys.path.append("../../cuhksz-phd/sho_util/pyfiles/")
from pytorch import cuda2numpy, cuda2cpu
from basic import plot_spectrogram
from sound import play_audio

# Project-local package one directory up; provides the trimming helper used
# by the feature-extraction cell.
sys.path.append("../")
from pyfiles.processsound import trim_audio_and_save

# BigVGAN checkout expected next to this repository; provides the model class
# and the mel-spectrogram function that is driven by the model's config.
sys.path.append("./../../BigVGAN/")
import bigvgan
from meldataset import get_mel_spectrogram

################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################



In [2]:
# --- Locations -------------------------------------------------------------
# Corpus root (input audio) and feature root (output .npy files).
dataset_dir = "/mntcephfs/lee_dataset/tts/LibriTTS_R/"
# Alternative feature root, kept for reference:
# feat_base_dir = "/mntcephfs/lab_data/shoinoue/Dataset/LibriTTS_R/features/"
feat_base_dir = "/mntcephfs/data/audiow/shoinoue/Dataset/LibriTTS_R/features/"
# NOTE(review): fs is not referenced by the visible cells; audio is loaded at
# model.h.sampling_rate instead — confirm whether this constant is still needed.
fs = 16000

# --- Speakers --------------------------------------------------------------
# Every entry two levels below the corpus root is taken as a speaker id
# (presumably <split>/<speaker> — verify against the corpus layout).
speakers = sorted(os.path.basename(entry) for entry in glob.glob(dataset_dir + "*/*"))

# --- Vocoder ---------------------------------------------------------------
# Alternative checkpoint, kept for reference:
# modelpath = "/mntcephfs/data/audiow/shoinoue/Model/hf_hub/bigvgan/models--nvidia--bigvgan_v2_24khz_100band_256x/snapshots/61df17db326f0876b7201d7a56c831898c836ef4"
modelpath = "/mntcephfs/data/audiow/shoinoue/Model/hf_hub/bigvgan/models--nvidia--bigvgan_v2_22khz_80band_fmax8k_256x/snapshots/189a02ed3b7957e8534b40e6314262df53536ece"
model = bigvgan.BigVGAN.from_pretrained(modelpath, use_cuda_kernel=False)
model.remove_weight_norm()
# eval() switches the module in place, so the same object stays bound to `model`.
model.eval()

Loading config.json from local directory
Loading weights from local directory
Removing weight norm...


In [45]:
# Extract one mel spectrogram per utterance and store it as
# <feat_base_dir>/<last three directories of the wav path>/<stem>_80mel.npy.
#
# Each utterance is first trimmed by trim_audio_and_save into a scratch wav,
# reloaded at the sampling rate declared in the BigVGAN config (model.h), and
# converted with get_mel_spectrogram using that same config.
save = True                # False = compute everything but write nothing
tempfile = "temp3.wav"     # scratch file, overwritten for every utterance
trim_threshold_in_db = 30  # forwarded unchanged to trim_audio_and_save

# Gather every wav that sits two directory levels below a speaker directory.
files = []
for spk in tqdm(speakers):
    files += glob.glob(dataset_dir + f"*/{spk}/*/*.wav")
files.sort()

for path in tqdm(files):
    trim_audio_and_save(path, savepath=tempfile, trim_threshold_in_db=trim_threshold_in_db)
    wav, sr = librosa.load(tempfile, sr=model.h.sampling_rate, mono=True)
    # get_mel_spectrogram is given a leading axis of size 1 and its first item
    # is read back below (presumably a batch axis — TODO confirm).
    wav = torch.FloatTensor(wav).unsqueeze(0)
    mel = get_mel_spectrogram(wav, model.h)
    mel = np.array(mel[0])
    # Keep the last four path components and drop the 4-character ".wav"
    # extension so the feature tree mirrors the corpus tree.
    savepath = feat_base_dir + "/".join(path.split("/")[-4:])[:-4] + "_80mel.npy"
    if save:
        # np.save does not create missing parent directories; without this a
        # fresh feature tree makes the first write fail with FileNotFoundError.
        os.makedirs(os.path.dirname(savepath), exist_ok=True)
        np.save(savepath, mel)

100%|██████████| 1230/1230 [00:27<00:00, 45.20it/s]
  8%|▊         | 12149/160267 [13:56<3:37:05, 11.37it/s] 



  9%|▊         | 13659/160267 [15:21<1:35:07, 25.69it/s]



 10%|▉         | 15366/160267 [16:44<4:46:02,  8.44it/s]



 13%|█▎        | 20118/160267 [21:08<2:24:48, 16.13it/s]



 13%|█▎        | 20162/160267 [21:11<2:40:13, 14.57it/s]



 13%|█▎        | 20185/160267 [21:13<1:44:42, 22.30it/s]



 13%|█▎        | 21556/160267 [22:27<2:09:56, 17.79it/s]



 16%|█▋        | 26052/160267 [26:14<1:00:13, 37.14it/s]



 17%|█▋        | 26539/160267 [26:37<1:04:59, 34.29it/s]



 20%|██        | 32285/160267 [31:34<1:11:08, 29.98it/s]



 25%|██▍       | 39485/160267 [38:40<2:32:36, 13.19it/s]



 27%|██▋       | 42689/160267 [41:34<1:08:23, 28.65it/s]



 32%|███▏      | 51939/160267 [49:09<1:51:13, 16.23it/s]



 32%|███▏      | 51957/160267 [49:10<1:50:07, 16.39it/s]



 49%|████▉     | 78640/160267 [1:10:00<55:56, 24.32it/s]  



 49%|████▉     | 78722/160267 [1:10:03<1:19:11, 17.16it/s]



 53%|█████▎    | 85241/160267 [1:14:11<32:48, 38.11it/s]  



 59%|█████▉    | 95161/160267 [1:20:46<28:42, 37.79it/s]  



 62%|██████▏   | 99677/160267 [1:23:39<36:25, 27.72it/s]  



 66%|██████▌   | 105688/160267 [1:27:44<49:14, 18.47it/s]  



 69%|██████▉   | 110351/160267 [1:30:42<27:49, 29.89it/s]  



 81%|████████  | 129400/160267 [1:43:47<24:34, 20.93it/s]  



 96%|█████████▌| 153970/160267 [2:01:08<04:31, 23.17it/s]



100%|██████████| 160267/160267 [2:05:13<00:00, 21.33it/s]


# Normalize

In [46]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import joblib

In [47]:
# Identifier for the fitted scaler; the scaler-fitting cell uses it to build
# the checkpoint file name under ckpts/scalers/.
scaler_name = "LibriTTS-R_80mel"

In [48]:
# Fit a StandardScaler incrementally over the saved mel features of the
# training splits (directories matching "train*") and persist it with joblib.
scaler = StandardScaler()

# Collect the feature files of every known speaker under the training splits.
files = []
for spk in tqdm(speakers):
    files += glob.glob(feat_base_dir + f"train*/{spk}/*/*_80mel.npy")
files.sort()

for path in tqdm(files):
    # Skip any path mentioning "km500" (presumably a different feature type
    # stored in the same tree — verify).
    if "km500" in path:
        continue
    mel = np.load(path)
    # partial_fit treats rows as samples, so the array is transposed first
    # (assumes the stored layout is (n_mels, n_frames) — TODO confirm).
    scaler.partial_fit(mel.T)

scaler_filename = f"ckpts/scalers/{scaler_name}.save"
# The checkpoint directory is not created anywhere else in this notebook;
# create it so the dump does not fail after the whole fitting pass has run.
os.makedirs(os.path.dirname(scaler_filename), exist_ok=True)
# To reload later: joblib.load(scaler_filename)
joblib.dump(scaler, scaler_filename)

100%|██████████| 1230/1230 [00:14<00:00, 83.49it/s]
100%|██████████| 149694/149694 [01:28<00:00, 1699.54it/s]


['ckpts/scalers/LibriTTS-R_80mel.save']