In [1]:
from pathlib import Path

import numpy as np
import torch
import torchaudio.compliance.kaldi as ta_kaldi
import torchaudio.sox_effects as ta_sox
from torchaudio.datasets import LIBRISPEECH
from tqdm import tqdm

In [2]:
from kaldi.feat.fbank import FbankOptions, Fbank
from kaldi.feat.mel import MelBanksOptions
from kaldi.feat.window import FrameExtractionOptions
from kaldi.matrix import Vector

ModuleNotFoundError: No module named 'kaldi'

In [4]:
# Open the LibriSpeech 'test-clean' split rooted at ./data; download=True asks
# torchaudio to fetch the archive when it is not already present there.
dataset = LIBRISPEECH(root='data', url='test-clean', download=True)

In [7]:
def _sox_convert(waveform,sample_rate,effects):
    try:
        import torchaudio.sox_effects as ta_sox
    except ImportError:
        raise ImportError("Please install torchaudio to convert audios")
    return ta_sox.apply_effects_tensor(waveform, sample_rate, effects)[0]


def _convert_to_mono(waveform, sample_rate):
    if waveform.shape[0] > 1:
        _waveform = torch.from_numpy(waveform)
        effects = [["channels", "1"]]
        return _sox_convert(_waveform, sample_rate, effects).numpy()
    return waveform

def _get_kaldi_fbank(_waveform, sample_rate, n_bins):
    mel_opts = MelBanksOptions()
    mel_opts.num_bins = n_bins
    frame_opts = FrameExtractionOptions()
    frame_opts.samp_freq = sample_rate
    opts = FbankOptions()
    opts.mel_opts = mel_opts
    opts.frame_opts = frame_opts
    fbank = Fbank(opts=opts)
    features = fbank.compute(Vector(waveform.squeeze()), 1.0).numpy()
    return features

def _get_torchaudio_fbank(_waveform, sample_rate, n_mel_bins):
    waveform = torch.from_numpy(waveform)
    features = ta_kaldi.fbank(
        waveform, num_mel_bins=n_bins, sample_frequency=sample_rate
    )
    return features.numpy()

def extract_fbank_features(wav, sample_rate, output_path, n_mel_bins=80):
    """Extract mel filter-bank features from one utterance and optionally save them.

    The waveform is down-mixed to mono, rescaled by ``2 ** 15``, and passed to
    the PyKaldi extractor; when that returns ``None`` the torchaudio
    extractor is used instead.

    Args:
        wav: Waveform with channels on the first axis.
        sample_rate: Sampling rate of ``wav``.
        output_path: ``pathlib.Path`` of the ``.npy`` file to write, or
            ``None`` to skip saving.
        n_mel_bins: Number of mel bins to compute.

    Returns:
        The extracted features (also written to ``output_path`` when given).
    """
    _waveform = _convert_to_mono(wav, sample_rate)
    _waveform = _waveform * (2 ** 15)  # Kaldi compliance: 16-bit signed integers
    # The extractors take NumPy input; only tensors need converting.
    if torch.is_tensor(_waveform):
        _waveform = _waveform.numpy()

    features = _get_kaldi_fbank(_waveform, sample_rate, n_mel_bins)
    if features is None:
        features = _get_torchaudio_fbank(_waveform, sample_rate, n_mel_bins)

    if output_path is not None:
        # np.save() does not create missing directories.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        np.save(output_path.as_posix(), features)
    return features
    

In [8]:
# Write one fbank .npy file per utterance under <cwd>/data/fbank80,
# named "<speaker>-<chapter>-<utterance>.npy".
feature_root = Path('data').absolute() / 'fbank80'
# The files are written with np.save(), which does not create missing
# directories, so make sure the output folder exists before the loop starts.
feature_root.mkdir(parents=True, exist_ok=True)
for wav, sample_rate, _, spk_id, chapter_no, utt_no in tqdm(dataset):
    sample_id = f"{spk_id}-{chapter_no}-{utt_no}"
    extract_fbank_features(
        wav, sample_rate, feature_root / f"{sample_id}.npy"
    )

  0%|                                                                                         | 0/2620 [00:00<?, ?it/s]


NameError: name 'MelBanksOptions' is not defined

In [2]:
import torchaudio
# Display, as a string, the audio I/O backend torchaudio reports as active.
# NOTE(review): newer torchaudio releases appear to deprecate this
# global-backend API - confirm against the installed version.
str(torchaudio.get_audio_backend())

'soundfile'

In [None]:
# Load the saved checkpoint file from the current working directory.
# NOTE(review): torch.load() unpickles the file, which can execute arbitrary
# code - only load checkpoints that come from a trusted source.
state_dict = torch.load('librispeech_transformer_s.pt')

In [None]:
# Inspect the optimizer state stored in the checkpoint.
# NOTE(review): this displays the whole entry, which may be very large -
# consider showing only its keys.
state_dict['last_optimizer_state']

In [None]:
# Pull out the checkpoint's 'model' entry.
# NOTE(review): presumably a mapping of parameter tensors rather than an
# instantiated nn.Module - confirm before calling it like a model.
model = state_dict['model']