4 changes: 0 additions & 4 deletions .gitmodules
@@ -1,7 +1,3 @@
[submodule "third_party/warp_transducer/submodule"]
	path = third_party/transducer/submodule
	url = https://github.com/HawkAaron/warp-transducer
	ignore = dirty
[submodule "kaldi"]
	path = third_party/kaldi/submodule
	url = https://github.com/kaldi-asr/kaldi
8 changes: 4 additions & 4 deletions docs/source/index.rst
@@ -21,7 +21,7 @@ Features described in this documentation are classified by release status:
*Prototype:* These features are typically not available as part of
binary distributions like PyPI or Conda, except sometimes behind run-time
flags, and are at an early stage for feedback and testing.


The :mod:`torchaudio` package consists of I/O, popular datasets and common audio transformations.

@@ -39,9 +39,9 @@ The :mod:`torchaudio` package consists of I/O, popular datasets and common audio
   compliance.kaldi
   kaldi_io
   utils
   transducer
   rnnt_loss


.. toctree::
   :maxdepth: 1
   :caption: PyTorch Libraries
6 changes: 3 additions & 3 deletions docs/source/transducer.rst → docs/source/rnnt_loss.rst
@@ -1,14 +1,14 @@
.. role:: hidden
   :class: hidden-section

torchaudio.prototype.transducer
torchaudio.prototype.rnnt_loss
===============================

.. currentmodule:: torchaudio.prototype.transducer
.. currentmodule:: torchaudio.prototype.rnnt_loss

.. note::

   The RNN transducer loss is a prototype feature; see `here <https://pytorch.org/audio>`_ to learn more about the nomenclature. It is only available in the nightly builds and must be imported explicitly: :code:`from torchaudio.prototype.transducer import rnnt_loss, RNNTLoss`.
   The RNN transducer loss is a prototype feature; see `here <https://pytorch.org/audio>`_ to learn more about the nomenclature. It is only available in the nightly builds and must be imported explicitly: :code:`from torchaudio.prototype.rnnt_loss import rnnt_loss, RNNTLoss`.

rnnt_loss
---------
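For orientation, a minimal usage sketch of the renamed prototype API (not part of this diff; the sizes are illustrative, the logits follow the `(batch, frames, targets + 1, classes)` layout used by the tests below, and the call mirrors `test_basic_backward` later in this PR):

```python
import torch
from torchaudio.prototype.rnnt_loss import RNNTLoss

# Illustrative sizes: B=1 sequence, T=50 frames, U=10 target tokens,
# D=20 classes; the blank defaults to the last class index (blank=-1).
logits = torch.randn(1, 50, 11, 20, requires_grad=True)  # (B, T, U + 1, D)
targets = torch.randint(0, 19, (1, 10), dtype=torch.int32)  # labels exclude the blank
logit_lengths = torch.tensor([50], dtype=torch.int32)
target_lengths = torch.tensor([10], dtype=torch.int32)

loss = RNNTLoss()(logits, targets, logit_lengths, target_lengths)
loss.backward()
```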
2 changes: 1 addition & 1 deletion examples/libtorchaudio/CMakeLists.txt
@@ -6,7 +6,7 @@ SET(BUILD_LIBTORCHAUDIO ON CACHE BOOL "Build libtorchaudio")
SET(BUILD_SOX ON CACHE BOOL "Build libsox into libtorchaudio")

SET(BUILD_KALDI OFF CACHE BOOL "Build Kaldi into libtorchaudio")
SET(BUILD_TRANSDUCER OFF CACHE BOOL "Build Python binding")
SET(BUILD_TRANSDUCER OFF CACHE BOOL "Build transducer into libtorchaudio")
SET(BUILD_TORCHAUDIO_PYTHON_EXTENSION OFF CACHE BOOL "Build Python binding")

find_package(Torch REQUIRED)
Empty file.
167 changes: 167 additions & 0 deletions test/torchaudio_unittest/rnnt/numpy_transducer.py
@@ -0,0 +1,167 @@
import numpy as np
import torch


class _NumpyTransducer(torch.autograd.Function):
    """Reference RNN-T loss implemented with NumPy forward-backward recursions."""

    @staticmethod
    def forward(
        ctx,
        log_probs,
        logit_lengths,
        target_lengths,
        targets,
        blank=-1,
    ):
        device = log_probs.device
        log_probs = log_probs.detach().cpu().numpy()
        logit_lengths = logit_lengths.detach().cpu().numpy()
        target_lengths = target_lengths.detach().cpu().numpy()
        targets = targets.detach().cpu().numpy()

        gradients, costs, _, _ = __class__.compute(
            log_probs=log_probs,
            logit_lengths=logit_lengths,
            target_lengths=target_lengths,
            targets=targets,
            blank=blank,
        )

        costs = torch.FloatTensor(costs).to(device=device)
        gradients = torch.FloatTensor(gradients).to(device=device)
        ctx.grads = gradients

        return costs

    @staticmethod
    def backward(ctx, output_gradients):
        # One gradient per forward input (log_probs, logit_lengths,
        # target_lengths, targets, blank); only log_probs receives one.
        return ctx.grads, None, None, None, None

    @staticmethod
    def compute_alpha_one_sequence(log_probs, targets, blank=-1):
        max_T, max_U, D = log_probs.shape
        alpha = np.zeros((max_T, max_U), dtype=np.float32)
        for t in range(1, max_T):
            alpha[t, 0] = alpha[t - 1, 0] + log_probs[t - 1, 0, blank]

        for u in range(1, max_U):
            alpha[0, u] = alpha[0, u - 1] + log_probs[0, u - 1, targets[u - 1]]

        for t in range(1, max_T):
            for u in range(1, max_U):
                skip = alpha[t - 1, u] + log_probs[t - 1, u, blank]
                emit = alpha[t, u - 1] + log_probs[t, u - 1, targets[u - 1]]
                alpha[t, u] = np.logaddexp(skip, emit)

        cost = -(alpha[-1, -1] + log_probs[-1, -1, blank])
        return alpha, cost

    @staticmethod
    def compute_beta_one_sequence(log_probs, targets, blank=-1):
        max_T, max_U, D = log_probs.shape
        beta = np.zeros((max_T, max_U), dtype=np.float32)
        beta[-1, -1] = log_probs[-1, -1, blank]

        for t in reversed(range(max_T - 1)):
            beta[t, -1] = beta[t + 1, -1] + log_probs[t, -1, blank]

        for u in reversed(range(max_U - 1)):
            beta[-1, u] = beta[-1, u + 1] + log_probs[-1, u, targets[u]]

        for t in reversed(range(max_T - 1)):
            for u in reversed(range(max_U - 1)):
                skip = beta[t + 1, u] + log_probs[t, u, blank]
                emit = beta[t, u + 1] + log_probs[t, u, targets[u]]
                beta[t, u] = np.logaddexp(skip, emit)

        cost = -beta[0, 0]
        return beta, cost

    @staticmethod
    def compute_gradients_one_sequence(
        log_probs, alpha, beta, targets, blank=-1
    ):
        max_T, max_U, D = log_probs.shape
        gradients = np.full(log_probs.shape, float("-inf"))
        cost = -beta[0, 0]

        gradients[-1, -1, blank] = alpha[-1, -1]

        gradients[:-1, :, blank] = alpha[:-1, :] + beta[1:, :]

        for u, l in enumerate(targets):
            gradients[:, u, l] = alpha[:, u] + beta[:, u + 1]

        # grad wrt log_probs is -exp(alpha + adjacent beta + log_prob - log-likelihood),
        # where cost == -log-likelihood.
        gradients = -(np.exp(gradients + log_probs + cost))
        return gradients

    @staticmethod
    def compute(
        log_probs,
        logit_lengths,
        target_lengths,
        targets,
        blank=-1,
    ):
        gradients = np.zeros_like(log_probs)
        B_tgt, max_T, max_U, D = log_probs.shape
        B_src = logit_lengths.shape[0]

        H = int(B_tgt / B_src)  # hypotheses per source sequence

        alphas = np.zeros((B_tgt, max_T, max_U))
        betas = np.zeros((B_tgt, max_T, max_U))
        betas.fill(float("-inf"))
        alphas.fill(float("-inf"))
        costs = np.zeros(B_tgt)
        for b_tgt in range(B_tgt):
            b_src = int(b_tgt / H)
            T = int(logit_lengths[b_src])
            # NOTE: see https://arxiv.org/pdf/1211.3711.pdf Section 2.1
            U = int(target_lengths[b_tgt]) + 1

            seq_log_probs = log_probs[b_tgt, :T, :U, :]
            seq_targets = targets[b_tgt, : int(target_lengths[b_tgt])]
            alpha, alpha_cost = __class__.compute_alpha_one_sequence(
                log_probs=seq_log_probs, targets=seq_targets, blank=blank
            )

            beta, beta_cost = __class__.compute_beta_one_sequence(
                log_probs=seq_log_probs, targets=seq_targets, blank=blank
            )

            seq_gradients = __class__.compute_gradients_one_sequence(
                log_probs=seq_log_probs,
                alpha=alpha,
                beta=beta,
                targets=seq_targets,
                blank=blank,
            )
            np.testing.assert_almost_equal(alpha_cost, beta_cost, decimal=2)
            gradients[b_tgt, :T, :U, :] = seq_gradients
            costs[b_tgt] = beta_cost
            alphas[b_tgt, :T, :U] = alpha
            betas[b_tgt, :T, :U] = beta

        return gradients, costs, alphas, betas


class NumpyTransducerLoss(torch.nn.Module):
    def __init__(self, blank=-1):
        super().__init__()
        self.blank = blank

    def forward(
        self,
        logits,
        logit_lengths,
        target_lengths,
        targets,
    ):
        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
        return _NumpyTransducer.apply(
            log_probs,
            logit_lengths,
            target_lengths,
            targets,
            self.blank,
        )
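As a quick illustration of how this NumPy reference is driven (a sketch, not part of the diff; the flat import path and all sizes are assumptions), the loss can be exercised end to end on random data. `backward` hands back the gradients stored for the summed per-sequence costs:

```python
import torch

from numpy_transducer import NumpyTransducerLoss  # assumed importable from this test dir

torch.manual_seed(0)
B, T, U, D = 2, 10, 3, 5  # D includes the blank, which sits at the last index (blank=-1)
logits = torch.randn(B, T, U + 1, D, requires_grad=True)
targets = torch.randint(0, D - 1, (B, U), dtype=torch.int32)  # labels exclude the blank
logit_lengths = torch.full((B,), T, dtype=torch.int32)
target_lengths = torch.full((B,), U, dtype=torch.int32)

costs = NumpyTransducerLoss()(logits, logit_lengths, target_lengths, targets)
costs.sum().backward()  # costs has shape (B,); grads flow back through log_softmax
print(costs, logits.grad.shape)
```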
9 changes: 9 additions & 0 deletions test/torchaudio_unittest/rnnt/rnnt_loss_cpu_test.py
@@ -0,0 +1,9 @@
import torch
from torchaudio_unittest import common_utils
from .utils import skipIfNoTransducer
from .rnnt_loss_impl import RNNTLossTest


@skipIfNoTransducer
class TestRNNTLoss(RNNTLossTest, common_utils.PytorchTestCase):
    device = torch.device('cpu')
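A CUDA counterpart would follow the same pattern in its own test file (a sketch only; this diff does not show a GPU test, and `common_utils.skipIfNoCuda` is assumed to exist in the shared test utilities):

```python
import torch
from torchaudio_unittest import common_utils
from .utils import skipIfNoTransducer
from .rnnt_loss_impl import RNNTLossTest


@skipIfNoTransducer
@common_utils.skipIfNoCuda
class TestRNNTLoss(RNNTLossTest, common_utils.PytorchTestCase):
    device = torch.device('cuda')
```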
116 changes: 116 additions & 0 deletions test/torchaudio_unittest/rnnt/rnnt_loss_impl.py
@@ -0,0 +1,116 @@
import numpy as np
from torchaudio.prototype.rnnt_loss import RNNTLoss

from .utils import (
    compute_with_numpy_transducer,
    compute_with_pytorch_transducer,
    get_B1_T10_U3_D4_data,
    get_data_basic,
    get_numpy_data_B1_T2_U3_D5,
    get_numpy_data_B2_T4_U3_D3,
    get_numpy_random_data,
    numpy_to_torch,
)


class RNNTLossTest:
    def _test_costs_and_gradients(
        self, data, ref_costs, ref_gradients, atol=1e-6, rtol=1e-2
    ):
        logits_shape = data["logits"].shape
        for reuse_logits_for_grads in [False, True]:
            with self.subTest(reuse_logits_for_grads=reuse_logits_for_grads):
                costs, gradients = compute_with_pytorch_transducer(
                    data=data, reuse_logits_for_grads=reuse_logits_for_grads
                )
                np.testing.assert_allclose(costs, ref_costs, atol=atol, rtol=rtol)
                self.assertEqual(logits_shape, gradients.shape)
                if not np.allclose(gradients, ref_gradients, atol=atol, rtol=rtol):
                    for b in range(len(gradients)):
                        T = data["logit_lengths"][b]
                        U = data["target_lengths"][b]
                        for t in range(gradients.shape[1]):
                            for u in range(gradients.shape[2]):
                                np.testing.assert_allclose(
                                    gradients[b, t, u],
                                    ref_gradients[b, t, u],
                                    atol=atol,
                                    rtol=rtol,
                                    err_msg=f"failed on b={b}, t={t}/T={T}, u={u}/U={U}",
                                )

    def test_basic_backward(self):
        rnnt_loss = RNNTLoss()
        logits, targets, logit_lengths, target_lengths = get_data_basic(self.device)
        loss = rnnt_loss(logits, targets, logit_lengths, target_lengths)
        loss.backward()

    def test_costs_and_gradients_B1_T2_U3_D5_fp32(self):
        data, ref_costs, ref_gradients = get_numpy_data_B1_T2_U3_D5(
            dtype=np.float32
        )
        data = numpy_to_torch(data=data, device=self.device, requires_grad=True)
        self._test_costs_and_gradients(
            data=data, ref_costs=ref_costs, ref_gradients=ref_gradients
        )

    def test_costs_and_gradients_B1_T2_U3_D5_fp16(self):
        data, ref_costs, ref_gradients = get_numpy_data_B1_T2_U3_D5(
            dtype=np.float16
        )
        data = numpy_to_torch(data=data, device=self.device, requires_grad=True)
        self._test_costs_and_gradients(
            data=data,
            ref_costs=ref_costs,
            ref_gradients=ref_gradients,
            atol=1e-3,
            rtol=1e-2,
        )

    def test_costs_and_gradients_B2_T4_U3_D3_fp32(self):
        data, ref_costs, ref_gradients = get_numpy_data_B2_T4_U3_D3(
            dtype=np.float32
        )
        data = numpy_to_torch(data=data, device=self.device, requires_grad=True)
        self._test_costs_and_gradients(
            data=data, ref_costs=ref_costs, ref_gradients=ref_gradients
        )

    def test_costs_and_gradients_B2_T4_U3_D3_fp16(self):
        data, ref_costs, ref_gradients = get_numpy_data_B2_T4_U3_D3(
            dtype=np.float16
        )
        data = numpy_to_torch(data=data, device=self.device, requires_grad=True)
        self._test_costs_and_gradients(
            data=data,
            ref_costs=ref_costs,
            ref_gradients=ref_gradients,
            atol=1e-3,
            rtol=1e-2,
        )

    def test_costs_and_gradients_random_data_with_numpy_fp32(self):
        seed = 777
        for i in range(5):
            data = get_numpy_random_data(dtype=np.float32, seed=(seed + i))
            data = numpy_to_torch(data=data, device=self.device, requires_grad=True)
            ref_costs, ref_gradients = compute_with_numpy_transducer(data=data)
            self._test_costs_and_gradients(
                data=data, ref_costs=ref_costs, ref_gradients=ref_gradients
            )

    def test_rnnt_nonfused_log_softmax(self):
        for random in [False, True]:
            data = get_B1_T10_U3_D4_data(
                random=random,
            )
            data = numpy_to_torch(
                data=data, device=self.device, requires_grad=True
            )
            data["fused_log_softmax"] = False
            ref_costs, ref_gradients = compute_with_numpy_transducer(
                data=data
            )
            self._test_costs_and_gradients(
                data=data, ref_costs=ref_costs, ref_gradients=ref_gradients
            )
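Because the NumPy reference stores gradients for the summed costs, its analytic gradients can also be sanity-checked with a plain central difference (a sketch outside this PR; the import path, element choice, and tolerance are assumptions):

```python
import torch

from numpy_transducer import NumpyTransducerLoss  # reference implementation above

torch.manual_seed(0)
B, T, U, D = 1, 3, 2, 4
logits = torch.randn(B, T, U + 1, D, requires_grad=True)
targets = torch.randint(0, D - 1, (B, U), dtype=torch.int32)
logit_lengths = torch.full((B,), T, dtype=torch.int32)
target_lengths = torch.full((B,), U, dtype=torch.int32)

loss_fn = NumpyTransducerLoss()
loss_fn(logits, logit_lengths, target_lengths, targets).sum().backward()

eps = 1e-2
idx = (0, 1, 1, 2)  # one arbitrary logit element
with torch.no_grad():
    plus, minus = logits.clone(), logits.clone()
    plus[idx] += eps
    minus[idx] -= eps
    f_plus = loss_fn(plus, logit_lengths, target_lengths, targets).sum()
    f_minus = loss_fn(minus, logit_lengths, target_lengths, targets).sum()

numeric = (f_plus - f_minus) / (2 * eps)
print(float(numeric), float(logits.grad[idx]))  # should agree to roughly 1e-2
```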