In [3]:
# !pip install Pillow -i https://pypi.tuna.tsinghua.edu.cn/simple
# !pip install matplotlib -i https://pypi.tuna.tsinghua.edu.cn/simple
# !pip install pandas -i https://pypi.tuna.tsinghua.edu.cn/simple
# !pip install fairseq -i https://pypi.tuna.tsinghua.edu.cn/simple

In [1]:
import os
import sys
import torch
import torch.nn.functional as F
import soundfile as sf
import numpy as np
import librosa
import traceback

from tqdm import tqdm
from fairseq import checkpoint_utils

import sys
sys.path.append("../../cuhksz-phd/sho_util/pyfiles/")
from basic import plot_spectrogram

sys.path.append("../")
from pyfiles.utils import trim_audio_and_save

  from .autonotebook import tqdm as notebook_tqdm
################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################



In [2]:
# Filesystem location of the pretrained HuBERT checkpoint loaded below.
model_path = "/mntcephfs/lab_data/shoinoue/Models/trained_models/hubert/hubert_base_ls960.pt"

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print("loading model(s) from {}".format(model_path))

# fairseq returns the list of loaded models, the saved config and the task;
# `saved_cfg` is read later (`saved_cfg.task.normalize`) when preparing audio.
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task([model_path], suffix="")

# Only the first model of the ensemble is used: cast its weights to fp16, move
# it to the selected device and switch it to evaluation mode.
# NOTE(review): fp16 weights on a CPU device may not be supported for every op,
# depending on the torch build — confirm if this is ever run without CUDA.
model = models[0].half().to(device)
model.eval()

def postprocess(feats, normalize=False):
    """Reduce a waveform tensor to 1-D and optionally layer-normalise it.

    Args:
        feats: ``torch.Tensor`` with one or two dimensions. A 2-D tensor is
            averaged over its last dimension.
        normalize: When true, apply ``F.layer_norm`` over the whole 1-D
            tensor (computed without gradient tracking).

    Returns:
        A 1-D ``torch.Tensor``. If the input is already 1-D and ``normalize``
        is false, the input tensor itself is returned.

    Raises:
        AssertionError: If the tensor is not 1-D after the optional averaging;
            the assertion message is the offending number of dimensions.
    """
    # Average away the trailing dimension of a 2-D input.
    if feats.dim() == 2:
        feats = feats.mean(-1)

    assert feats.dim() == 1, feats.dim()

    # Nothing more to do unless normalisation was requested.
    if not normalize:
        return feats

    with torch.no_grad():
        return F.layer_norm(feats, feats.shape)

loading model(s) from /mntcephfs/lab_data/shoinoue/Models/trained_models/hubert/hubert_base_ls960.pt


2024-07-26 15:05:59 | INFO | fairseq.tasks.hubert_pretraining | current directory is /home/shoinoue/Git/seq2seq-vc/notebooks
2024-07-26 15:05:59 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': '/checkpoint/wnhsu/data/librispeech/960h/iter/250K_50hz_km100_mp0_65_v2', 'fine_tuning': False, 'labels': ['layer6.km500'], 'label_dir': None, 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}
2024-07-26 15:05:59 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': gelu, 'layer_type': transformer, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation_dropout': 

In [10]:
# ---------------------------------------------------------------------------
# Dataset selection: exactly one (wav_dir, output_dir, add) triple is active.
# `wav_dir` must end with "/" because relative paths below are obtained by
# slicing off the first len(wav_dir) characters.
# ---------------------------------------------------------------------------
### CMU-ARCTIC
# wav_dir = "/mntcephfs/lab_data/shoinoue/Dataset/CMU-ARCTIC/SLT/wav/"
# wav_dir = "/mntcephfs/lab_data/shoinoue/Dataset/PD-AST/SLT/Hindi/wav/"
# output_dir = wav_dir[:-4] + "hubert/"
# add = ""

### LibriTTS-R
wav_dir = "/mntcephfs/lee_dataset/tts/LibriTTS_R/"
output_dir = "/mntcephfs/lab_data/shoinoue/Dataset/LibriTTS_R/features/"
add = "_hubert"

# Extraction settings (previously inline magic numbers).
TARGET_SR = 16000             # sample rate the audio is brought to before extraction
MIN_SAMPLES = 3200            # 0.2 s at 16 kHz; shorter clips are skipped
MAX_SAMPLES = 30 * TARGET_SR  # 30 s at 16 kHz; longer clips are skipped
OUTPUT_LAYER = 9              # value passed as `output_layer` to model.extract_features

# Feed the model inputs in the same dtype as its weights instead of
# hard-coding `.half()`, so this cell keeps working if the model precision
# is changed in the loading cell.
model_dtype = next(model.parameters()).dtype

# Per-process scratch file: a fixed "temp.wav" would be shared (and clobbered)
# by any other run started from the same working directory.
temp_wav_path = f"temp_{os.getpid()}.wav"

for subdir, dirs, files in tqdm(os.walk(wav_dir)):
    # Mirror the input directory layout under output_dir.
    os.makedirs(output_dir + subdir[len(wav_dir):], exist_ok=True)
    for file in files:
        # Skip "._*" files and anything that is not a .wav.
        if file.startswith("._") or not file.endswith(".wav"):
            continue

        wav_path = os.path.join(subdir, file)
        # <output_dir>/<path relative to wav_dir, without ".wav"><add>.npy
        savefile = output_dir + wav_path[len(wav_dir):-4] + f"{add}.npy"
        if os.path.exists(savefile):
            # Already extracted by a previous run (resume support).
            continue

        try:
            # trim_audio_and_save is a project helper; presumably it writes a
            # trimmed copy at TARGET_SR to temp_wav_path — confirm in pyfiles.utils.
            # It is inside the try so one unreadable file is logged and skipped
            # rather than aborting the whole multi-hour walk.
            trim_audio_and_save(wav_path, TARGET_SR, temp_wav_path)
            wav, sr = sf.read(temp_wav_path)
        except Exception:
            print("wav load error: ", wav_path)
            traceback.print_exc()
            continue
        finally:
            # Always drop the scratch file so a later iteration can never read
            # stale audio that belongs to a previous utterance.
            if os.path.exists(temp_wav_path):
                os.remove(temp_wav_path)

        if sr != TARGET_SR:
            # Axis 0 is treated as time below (wav.shape[0]); resample along it
            # explicitly because librosa defaults to the last axis.
            wav = librosa.resample(wav, orig_sr=sr, target_sr=TARGET_SR, axis=0)

        if wav.shape[0] < MIN_SAMPLES or wav.shape[0] > MAX_SAMPLES:
            print("error : wav too short or too long ", wav_path, wav.shape)
            continue

        feat = torch.from_numpy(wav).float()
        feat = postprocess(feat, normalize=saved_cfg.task.normalize)
        feats = feat.view(1, -1)  # add a leading batch dimension of size 1
        # All-False mask, same shape as the input: nothing is padding.
        padding_mask = torch.zeros(feats.shape, dtype=torch.bool, device=device)

        inputs = {
            "source": feats.to(device=device, dtype=model_dtype),
            "padding_mask": padding_mask,
            "output_layer": OUTPUT_LAYER,
        }

        with torch.no_grad():
            logits = model.extract_features(**inputs)

        # First element of the returned tuple, with the batch dimension removed.
        out_feat = logits[0].squeeze(0)
        out_feat = out_feat.detach().cpu().numpy().astype(np.float16)

        # Write to a sibling file and rename into place: an interrupted run must
        # not leave a truncated `savefile`, because the exists() check above
        # would then skip it forever. The partial name ends in ".npy" so that
        # np.save does not append another extension.
        partial_path = savefile + ".partial.npy"
        np.save(partial_path, out_feat)
        os.replace(partial_path, savefile)

109it [00:00, 1086.66it/s]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/test-clean/4507/16021/4507_16021_000025_000000.wav (491139,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/test-clean/4507/16021/4507_16021_000023_000002.wav (585728,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/test-clean/7127/75947/7127_75947_000079_000000.wav (3072,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/test-clean/1089/134686/1089_134686_000008_000000.wav (497152,)


357it [00:01, 221.46it/s] 

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/test-clean/7021/79730/7021_79730_000052_000000.wav (517891,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-100/8465/246947/8465_246947_000028_000000.wav (3072,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-100/1743/142912/1743_142912_000015_000001.wav (2560,)


482it [00:01, 329.66it/s]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-100/669/129061/669_129061_000040_000004.wav (489731,)


586it [00:02, 227.78it/s]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-100/669/129074/669_129074_000028_000000.wav (509059,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-100/26/496/26_496_000029_000000.wav (488963,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-100/26/496/26_496_000026_000000.wav (484739,)


791it [00:03, 213.19it/s]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-100/7517/100437/7517_100437_000004_000001.wav (530819,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-100/8095/274345/8095_274345_000033_000004.wav (498176,)


868it [00:04, 169.33it/s]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-100/5322/7678/5322_7678_000004_000007.wav (488451,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-100/8238/283452/8238_283452_000006_000000.wav (1920,)


924it [00:04, 163.67it/s]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-100/254/145458/254_145458_000013_000001.wav (496640,)


967it [00:05, 145.30it/s]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-100/839/130898/839_130898_000011_000001.wav (516611,)


1089it [00:05, 192.17it/s]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-100/7780/274562/7780_274562_000005_000006.wav (485376,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-100/78/369/78_369_000035_000003.wav (3072,)


1198it [06:20,  6.03s/it] 

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/1806/143946/1806_143946_000008_000000.wav (515072,)


1291it [11:39,  1.86s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/7398/98878/7398_98878_000010_000001.wav (1920,)


1292it [11:41,  1.89s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/7398/98877/7398_98877_000012_000003.wav (2560,)


1330it [13:52,  5.24s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/1265/135635/1265_135635_000052_000000.wav (3072,)


1413it [21:20,  3.27s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/6446/40571/6446_40571_000013_000000.wav (645891,)


1418it [21:40,  4.15s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/7594/91192/7594_91192_000009_000002.wav (503296,)


1433it [22:29,  2.30s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/2709/158074/2709_158074_000032_000003.wav (1920,)


1458it [24:32,  3.96s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/5389/4861/5389_4861_000040_000001.wav (531971,)


1490it [25:50,  1.51s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/1182/134981/1182_134981_000027_000000.wav (2560,)


1536it [28:29,  3.38s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/2364/131735/2364_131735_000029_000005.wav (494211,)


1561it [30:17,  5.38s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/7484/39971/7484_39971_000005_000003.wav (513283,)


1592it [32:17,  5.14s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/6567/53342/6567_53342_000012_000001.wav (565248,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/6567/53342/6567_53342_000015_000003.wav (498176,)


1605it [33:12,  3.46s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/6104/58845/6104_58845_000023_000000.wav (3072,)


1617it [33:59,  3.12s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/4806/26894/4806_26894_000004_000000.wav (1280,)


1719it [39:02,  3.82s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/6308/68358/6308_68358_000017_000000.wav (484099,)


1754it [40:59,  3.25s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/5583/41259/5583_41259_000007_000005.wav (3072,)


1773it [41:51,  2.16s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/4238/12895/4238_12895_000007_000005.wav (500611,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/4238/12895/4238_12895_000006_000002.wav (548739,)


1904it [49:18,  2.81s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/8479/276730/8479_276730_000026_000000.wav (2560,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/8479/276730/8479_276730_000042_000000.wav (1920,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/8479/276730/8479_276730_000034_000000.wav (1920,)


1912it [49:56,  4.21s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/7258/91905/7258_91905_000006_000012.wav (498688,)


2053it [58:11,  5.53s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/454/134728/454_134728_000113_000000.wav (513411,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/454/134728/454_134728_000011_000000.wav (537731,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/454/134728/454_134728_000056_000000.wav (511363,)


2096it [1:01:22,  4.05s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/7437/117899/7437_117899_000011_000000.wav (2560,)


2326it [1:14:34,  5.09s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/1271/128145/1271_128145_000006_000000.wav (552323,)


2329it [1:14:42,  3.88s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/1271/133279/1271_133279_000015_000002.wav (480256,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/1271/133279/1271_133279_000005_000006.wav (505344,)


2373it [1:17:54,  5.07s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/1027/125147/1027_125147_000124_000001.wav (3072,)


2563it [1:33:32,  4.92s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/7139/75362/7139_75362_000002_000002.wav (510979,)


2567it [1:33:51,  5.04s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/7139/75360/7139_75360_000011_000007.wav (491011,)


2579it [1:34:30,  2.05s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/1001/134708/1001_134708_000013_000000.wav (589824,)


2661it [1:40:45,  3.83s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/2240/148521/2240_148521_000010_000001.wav (489091,)


2689it [1:43:19,  4.61s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/335/125951/335_125951_000017_000002.wav (2560,)


2858it [1:54:41,  4.36s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/598/127704/598_127704_000016_000007.wav (483840,)


2980it [2:01:59,  2.40s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/5984/56837/5984_56837_000014_000004.wav (501760,)


2990it [2:02:55,  5.67s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/7240/108066/7240_108066_000005_000000.wav (526848,)


3136it [2:12:24,  4.41s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/7647/102250/7647_102250_000005_000000.wav (565248,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/7647/102250/7647_102250_000012_000000.wav (482816,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/7647/102250/7647_102250_000012_000003.wav (519168,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/7647/102250/7647_102250_000010_000006.wav (559104,)


3270it [2:21:16,  3.03s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/699/132864/699_132864_000009_000000.wav (534019,)


3306it [2:23:27,  4.27s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/6373/65772/6373_65772_000004_000002.wav (521731,)


3388it [2:28:17,  1.97s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/8075/279229/8075_279229_000005_000001.wav (487299,)


3411it [2:29:35,  4.34s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/3032/19361/3032_19361_000041_000001.wav (488451,)


3505it [2:35:34,  3.26s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/2582/155973/2582_155973_000023_000001.wav (495616,)


3508it [2:35:47,  3.65s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/899/126232/899_126232_000007_000000.wav (487939,)


3590it [2:41:03,  4.97s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/3513/7741/3513_7741_000060_000001.wav (2560,)


3610it [2:42:07,  3.29s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/7515/220979/7515_220979_000011_000001.wav (509571,)


3612it [2:42:13,  3.34s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/7515/100431/7515_100431_000010_000005.wav (501760,)


3625it [2:42:52,  3.85s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/3728/105386/3728_105386_000010_000001.wav (2560,)


3644it [2:44:13,  5.19s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/4363/14936/4363_14936_000008_000000.wav (555011,)


3898it [3:00:12,  4.53s/it]

error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/14/212/14_212_000011_000004.wav (3072,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/14/212/14_212_000018_000001.wav (2560,)
error : wav too short or too long  /mntcephfs/lee_dataset/tts/LibriTTS_R/train-clean-360/14/212/14_212_000011_000009.wav (3072,)


3971it [3:04:58,  2.79s/it]
