In [1]:
import json
import pickle

import torch

from data import LibriSpeechDataset
from quantizer import Quantizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Instantiate the project-local dataset wrapper; `root` and `split` are passed
# through unchanged to LibriSpeechDataset.
dataset = LibriSpeechDataset(root="datasets", split="train")

In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# The quantizer is constructed with the selected device string.
quantizer = Quantizer(DEVICE)

In [4]:
# Load the mapping that is later indexed by utterance identifier to obtain the
# reference phone sequence. The encoding is pinned to UTF-8 (the JSON
# interchange encoding) so the read does not depend on the platform's default
# locale encoding.
with open("librispeech_normalized_phones.json", "r", encoding="utf-8") as phones_file:
    identifier_to_phones = json.load(phones_file)

In [5]:
# Index of the single utterance inspected in the rest of this notebook.
SAMPLE_INDEX = 2

# Fetch each record exactly once instead of re-indexing the datasets for every
# field (each indexing call may reload the item — TODO confirm against the
# dataset implementation).
raw_record = dataset.dataset[SAMPLE_INDEX]
sample = dataset[SAMPLE_INDEX]

# Fields 3, 4 and 5 of the underlying record form the lookup key, with the last
# one zero-padded to four digits.
# NOTE(review): presumably (speaker_id, chapter_id, utterance_id) as in
# torchaudio's LIBRISPEECH items — confirm.
identifier = f"{raw_record[3]}-{raw_record[4]}-{raw_record[5]:04d}"

# The wrapper returns the text as its last element and the audio as its
# second-to-last element.
text = sample[-1]
audio = sample[-2]

In [6]:
# Prepend a dimension of size one to the audio and move it to the quantizer's
# device before quantizing.
audio_batch = audio.unsqueeze(0).to(DEVICE)
quantized_indices = quantizer.quantize(audio_batch)

# Convert to plain Python lists and keep the first (index 0) entry.
quantized_indices_list = quantized_indices.tolist()[0]

In [7]:
# Load the lookup table that is indexed by quantized index and yields a
# {phoneme: score} mapping for that index.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load this from a trusted, locally produced artifact.
with open("p_phoneme_quantized_idx.pkl", "rb") as pickle_file:
    p_phoneme_quantized_idx = pickle.load(pickle_file)

In [8]:
# Lookup from a fine-grained phone label to a coarser symbol. Several labels
# share one target (e.g. aa/ae/ah/aw/ay -> "a"; em/en/eng -> "n"), while
# "bcl" and "[UNK]" map to themselves. Pairs are listed in the dictionary's
# insertion order.
phoneme_converter = dict(
    [
        ("aa", "a"), ("ae", "a"), ("ah", "a"), ("aw", "a"), ("ay", "a"),
        ("b", "b"), ("ch", "c"), ("d", "d"), ("dh", "d"), ("dx", "t"),
        ("eh", "e"), ("axr", "er"), ("ey", "e"), ("f", "f"), ("g", "g"),
        ("hh", "h"), ("ih", "e"), ("iy", "e"), ("jh", "j"), ("k", "k"),
        ("el", "l"), ("em", "n"), ("en", "n"), ("eng", "n"), ("ow", "o"),
        ("oy", "o"), ("p", "p"), ("r", "r"), ("s", "s"), ("sh", "s"),
        ("t", "t"), ("th", "t"), ("uh", "u"), ("uw", "u"), ("v", "v"),
        ("w", "w"), ("y", "y"), ("z", "z"), ("bcl", "bcl"), ("[UNK]", "[UNK]"),
    ]
)

In [9]:
# Decode every quantized index to the phoneme with the highest score in its
# {phoneme: score} table. Indices that are missing from the table, or whose
# table is empty, are emitted as "[UNK]" instead of raising.
phonemes = []
for quantized_idx in quantized_indices_list:
    probs = p_phoneme_quantized_idx.get(quantized_idx)
    phonemes.append(max(probs, key=probs.get) if probs else "[UNK]")

# Collapse runs of identical consecutive phonemes into a single entry.
# Building the list incrementally (rather than seeding it with element 0)
# keeps this step valid when no indices were produced at all.
# The optional coarse mapping through `phoneme_converter` is intentionally not
# applied here, so the labels stay in the fine-grained phone set.
normalized_phonemes = []
for phoneme in phonemes:
    if not normalized_phonemes or normalized_phonemes[-1] != phoneme:
        normalized_phonemes.append(phoneme)

# Replace the "bcl" label with a blank so it reads as a gap in the output.
normalized_phonemes = [" " if phoneme == "bcl" else phoneme for phoneme in normalized_phonemes]


In [12]:
" ".join(normalized_phonemes)

'  f sh f w oy ih ae [UNK] en ow aa ay aa dx t iy y iy uw v [UNK] v ih en eh ih   b w aa axr r ow aa ah f   k   k ih   dx r [UNK] r ae ay aa en em   p hh ae s z   [UNK] em ih s uw ih z dh r ih iy ih   hh en t sh ah ow el v el iy en z s   aa uw ow r ow el   w el eh   dh eh ae ay   t iy y iy uw iy r axr   g ow aa ay r eh   sh f r axr   d y iy s th s ih en   th z s iy y iy ae eh en   ey ih   k p k el aa axr ae axr en   ih iy ey   p axr aa ah axr   el ih el ah en   dh uw w iy uw en el iy ih s z   k aa ay aa ae en em v en sh ih s th z ih ae   em ih iy z s uw ih z s t v r iy ih   sh s ow el ow em   w el s z s iy   iy en iy ih ae aa t k ih ah axr   w ih iy ih en   d iy ih eh oy ah el ah   p   s iy   ih iy [UNK] ae eh ih sh jh ow aa r eh aa   b aw aa ay ae eh ah aa ay en [UNK] ih   ae eh   f v r iy hh th   f k iy eng en eh ey ih   d   p t hh aw ae   hh s z   t   f r ah em ah   r aa axr ih [UNK]   s ih en   sh ow uw el ow ih   dx axr iy ih en ae aw aa ay ae   p  '

In [13]:
" ".join(identifier_to_phones[identifier])

'bcl f aa r en aa t iy v ih en ah b r uh k bcl k uh d r ah en p ae s t em ih s ih z r ey ch ah el el ih en d z d aa r w ih th aw t d uw r ih g aa r d f axr d iy s ah en s iy ae en d d ih k aa r ah em bcl ih t p r aa b ah b el iy w ah z k aa en sh ah s dh ah t em ih s ih z r ey ch ah el w ah z s ih t ih eng ae t hh axr w ih en d ow bcl k iy p ih eng ah sh aa r p ay aa en eh v r iy th ih eng dh ah t p ae s t bcl f axr em b r uh k s ah en d ch ih el d r ah en ah p bcl'