In [5]:
# !pip install -U encodec
# !pip install -U noisereduce
# !pip install -U git+https://git@github.com/facebookresearch/encodec#egg=encodec  # bleeding edge

In [12]:
import warnings
warnings.filterwarnings("ignore")

from encodec import EncodecModel
from encodec.utils import convert_audio

import torchaudio
import torch
import glob
import os
import numpy as np
from tqdm import tqdm

import sys
sys.path.append("../sho_util/pyfiles/")
from sound import play_audio

In [10]:
from g2p_en import G2p

# Build the grapheme-to-phoneme converter once at module level;
# get_phonemes() below reuses this single instance on every call.
g2p = G2p()
def get_phonemes(text, converter=None):
    """Convert text into a 1-D array of phoneme symbols without separators.

    The converter output is filtered so that word separators (" ") and the
    punctuation tokens ",", ".", "!" and "?" are removed.

    Parameters
    ----------
    text : str
        Text to convert.
    converter : callable, optional
        Callable mapping a string to a sequence of symbol strings. Defaults
        to the module-level ``g2p`` instance.

    Returns
    -------
    numpy.ndarray
        1-D array of strings. It is always string-typed, even when nothing
        is left after filtering (e.g. for empty text).
    """
    if converter is None:
        converter = g2p
    dropped = (" ", ",", ".", "!", "?")
    kept = [symbol for symbol in converter(text) if symbol not in dropped]
    # Filter before building the array and force a string dtype: np.array([])
    # would otherwise default to float64, which breaks string comparisons and
    # produces an inconsistently typed file when saved.
    return np.array(kept, dtype=str)

# Preparation

In [4]:
# --- Dataset locations -------------------------------------------------------
# Inputs (wav / txt) and outputs (phonemes / encodec) live in sibling folders
# under the dataset root. Every path keeps its trailing "/" because later code
# builds file paths from these strings.
dataset_dir = "../../Dataset/Blizzard2013/"
wav_dir = dataset_dir + "BC2013_segmented_v1_wav_selection/"
txt_dir = dataset_dir + "BC2013_segmented_v1_txt_selection/"
pho_dir = dataset_dir + "BC2013_segmented_v1_phonemes_selection/"
enc_dir = dataset_dir + "BC2013_segmented_v1_encodec_selection/"

# Sorted listings of the audio files and transcripts (one sub-folder level deep).
wavlist = sorted(glob.glob(wav_dir + "*/*.wav"))
txtlist = sorted(glob.glob(txt_dir + "*/*.txt"))

# --- EnCodec model -----------------------------------------------------------
# Target bandwidth in kbps. The number of codebooks used is determined by the
# bandwidth selected; e.g. for a bandwidth of 6 kbps, `n_q = 8` codebooks are used.
# Supported bandwidths for the 24 kHz model are 1.5 kbps (n_q = 2), 3 kbps (n_q = 4),
# 6 kbps (n_q = 8), 12 kbps (n_q = 16) and 24 kbps (n_q = 32).
# For the 48 kHz model, only 3, 6, 12 and 24 kbps are supported, and the number of
# codebooks for each is half that of the 24 kHz model as the frame rate is twice as much.
bw = 24.0

# Instantiate the pretrained 24 kHz EnCodec model and select the bandwidth.
model = EncodecModel.encodec_model_24khz()
model.set_target_bandwidth(bw)
sr = model.sample_rate

# src = wav_dir[:-1]
# for dst in [pho_dir[:-1], enc_dir[:-1]]:
#     if not os.path.exists(dst):
#         os.makedirs(dst)
#     for root, dirs, files in os.walk(src):
#         for dir in dirs:
#             src_path = os.path.join(root, dir)
#             dst_path = src_path.replace(src, dst, 1)

#             os.makedirs(dst_path, exist_ok=True)

In [15]:
# Encode every utterance: save its phoneme sequence and its EnCodec codes as
# .npy files under pho_dir / enc_dir, mirroring the "<subdir>/<name>" layout of
# the transcript files.
#
# wavlist and txtlist are paired purely by sorted position, so check up front
# that they line up instead of silently writing mislabeled data.
num = len(txtlist)
if len(wavlist) != num:
    raise ValueError(
        f"Found {len(wavlist)} wav files but {num} transcripts; cannot pair them by index."
    )

for idx in tqdm(range(num)):
    txt = txtlist[idx]
    wav_path = wavlist[idx]

    # The last two path components of the transcript identify the utterance.
    utt_subdir = os.path.basename(os.path.dirname(txt))
    utt_name = os.path.splitext(os.path.basename(txt))[0]
    if os.path.splitext(os.path.basename(wav_path))[0] != utt_name:
        raise ValueError(f"Transcript {txt} does not match audio file {wav_path}.")

    # Context manager so the file handle is closed on every iteration.
    with open(txt, "r") as f:
        text = f.read()
    phonemes = get_phonemes(text)

    wav, fs = torchaudio.load(wav_path)
    wav = convert_audio(wav, fs, model.sample_rate, model.channels)
    wav = wav.unsqueeze(0)  # add a leading batch dimension
    # First batch item / first channel as a numpy array; the playback cell
    # further down reads `x` (and `encoded_frames`) from the last iteration.
    x = wav.numpy()[0][0]
    with torch.no_grad():
        encoded_frames = model.encode(wav)
    codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1)  # [B, n_q, T]

    pho_sp = os.path.join(pho_dir, utt_subdir, utt_name + ".npy")
    enc_sp = os.path.join(enc_dir, utt_subdir, utt_name + ".npy")
    # np.save does not create missing folders, so make sure they exist.
    os.makedirs(os.path.dirname(pho_sp), exist_ok=True)
    os.makedirs(os.path.dirname(enc_sp), exist_ok=True)
    np.save(pho_sp, phonemes)
    # Convert explicitly rather than relying on np.save's implicit tensor coercion.
    np.save(enc_sp, codes.cpu().numpy())

100%|██████████| 40565/40565 [2:35:35<00:00,  4.35it/s]  


In [16]:
# Listening check: decode the codes of the last utterance processed by the
# encoding loop (`encoded_frames`) and play the original (`x`) followed by the
# reconstruction (`y`).
with torch.no_grad():  # inference only, same as the encode step
    reconstructed_wav = model.decode(encoded_frames)
y = reconstructed_wav.cpu().numpy()[0][0]
play_audio(x, sr)
play_audio(y, sr)