6 changes: 4 additions & 2 deletions torchaudio/functional/functional.py

@@ -155,7 +155,8 @@ def griffinlim(
     r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.
 
     Implementation ported from
-    :footcite:`brian_mcfee-proc-scipy-2015`, :footcite:`6701851` and :footcite:`1172092`.
+    *librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`]
+    and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`].
 
     Args:
         specgram (Tensor): A magnitude-only STFT spectrogram of dimension (..., freq, frames)

@@ -1207,7 +1208,8 @@ def compute_kaldi_pitch(
     recompute_frame: int = 500,
     snip_edges: bool = True,
 ) -> torch.Tensor:
-    """Extract pitch based on method described in :footcite:`6854049`.
+    """Extract pitch based on method described in *A pitch extraction algorithm tuned
+    for automatic speech recognition* [:footcite:`6854049`].
 
     This function computes the equivalent of `compute-kaldi-pitch-feats` from Kaldi.
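For reviewers who want to exercise the function this hunk documents, here is a minimal sketch of `compute_kaldi_pitch` (the shapes and the 16 kHz sample rate are illustrative assumptions, and the function requires a torchaudio build with the Kaldi integration):

```python
import torch
import torchaudio.functional as F

# One second of dummy audio; any float tensor of shape (..., time) works.
waveform = torch.rand(1, 16000)

# Output has shape (..., frames, 2); the two values per frame are the
# NCCF and the pitch estimate (see the docstring for the exact order).
pitch_feature = F.compute_kaldi_pitch(waveform, sample_rate=16000)
```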
4 changes: 3 additions & 1 deletion torchaudio/models/conv_tasnet.py

@@ -164,7 +164,9 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
 
 
 class ConvTasNet(torch.nn.Module):
-    """Conv-TasNet: a fully-convolutional time-domain audio separation network :footcite:`Luo_2019`.
+    """Conv-TasNet: a fully-convolutional time-domain audio separation network
+    *Conv-TasNet: Surpassing Ideal Time–Frequency Magnitude Masking for Speech Separation*
+    [:footcite:`Luo_2019`].
 
     Args:
         num_sources (int): The number of sources to split.
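A minimal usage sketch of the `ConvTasNet` class whose docstring changes here (batch size and frame count are arbitrary):

```python
import torch
from torchaudio.models import ConvTasNet

model = ConvTasNet(num_sources=2)

# Conv-TasNet expects single-channel input: (batch, channel == 1, frames).
mixture = torch.rand(3, 1, 32000)
separated = model(mixture)  # (batch, num_sources, frames) -> (3, 2, 32000)
```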
3 changes: 2 additions & 1 deletion torchaudio/models/deepspeech.py

@@ -31,7 +31,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 class DeepSpeech(torch.nn.Module):
     """
-    DeepSpeech model architecture from :footcite:`hannun2014deep`.
+    DeepSpeech model architecture from *Deep Speech: Scaling up end-to-end speech recognition*
+    [:footcite:`hannun2014deep`].
 
     Args:
         n_feature: Number of input features
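A corresponding sketch for `DeepSpeech` (the 40-feature, 29-class configuration is an illustrative assumption):

```python
import torch
from torchaudio.models import DeepSpeech

model = DeepSpeech(n_feature=40, n_class=29)

# Input is (batch, channel == 1, time, n_feature), e.g. filterbank frames.
features = torch.rand(4, 1, 100, 40)
output = model(features)  # log-probabilities of shape (batch, time, n_class)
```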
3 changes: 2 additions & 1 deletion torchaudio/models/wav2letter.py

@@ -7,7 +7,8 @@
 
 
 class Wav2Letter(nn.Module):
-    r"""Wav2Letter model architecture from :footcite:`collobert2016wav2letter`.
+    r"""Wav2Letter model architecture from *Wav2Letter: an End-to-End ConvNet-based Speech
+    Recognition System* [:footcite:`collobert2016wav2letter`].
 
     :math:`\text{padding} = \frac{\text{ceil}(\text{kernel} - \text{stride})}{2}`
 
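Likewise for `Wav2Letter`, using its documented defaults:

```python
import torch
from torchaudio.models import Wav2Letter

# input_type="waveform" prepends a conv frontend over the raw audio.
model = Wav2Letter(num_classes=40, input_type="waveform", num_features=1)

waveform = torch.rand(2, 1, 16000)  # (batch, num_features, input_length)
output = model(waveform)            # (batch, num_classes, output_length)
```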
8 changes: 4 additions & 4 deletions torchaudio/models/wav2vec2/model.py

@@ -7,7 +7,7 @@
 
 
 class Wav2Vec2Model(Module):
-    """Encoder model used in [:footcite:`baevski2020wav2vec`].
+    """Encoder model used in *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
 
     Note:
         To build the model, please use one of the factory functions.

@@ -122,7 +122,7 @@ def _get_model(
 
 
 def wav2vec2_base(num_out: int) -> Wav2Vec2Model:
-    """Build wav2vec2.0 model with "Base" configuration from [:footcite:`baevski2020wav2vec`].
+    """Build wav2vec2.0 model with "Base" configuration from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
 
     Args:
         num_out: int

@@ -164,7 +164,7 @@ def wav2vec2_base(num_out: int) -> Wav2Vec2Model:
 
 
 def wav2vec2_large(num_out: int) -> Wav2Vec2Model:
-    """Build wav2vec2.0 model with "Large" configuration from [:footcite:`baevski2020wav2vec`].
+    """Build wav2vec2.0 model with "Large" configuration from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
 
     Args:
         num_out: int

@@ -206,7 +206,7 @@ def wav2vec2_large(num_out: int) -> Wav2Vec2Model:
 
 
 def wav2vec2_large_lv60k(num_out: int) -> Wav2Vec2Model:
-    """Build wav2vec2.0 model with "Large LV-60k" configuration from [:footcite:`baevski2020wav2vec`].
+    """Build wav2vec2.0 model with "Large LV-60k" configuration from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
 
     Args:
         num_out: int
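A sketch of the `wav2vec2_base` factory function touched here (the `num_out=32` output size is an illustrative assumption; the other factories are used the same way):

```python
import torch
from torchaudio.models import wav2vec2_base

model = wav2vec2_base(num_out=32)

# Raw audio of shape (batch, time); lengths are optional, so the second
# return value is None here.
waveforms = torch.randn(2, 16000)
logits, lengths = model(waveforms)  # logits: (batch, frames, num_out)
```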
8 changes: 4 additions & 4 deletions torchaudio/models/wavernn.py

@@ -14,7 +14,7 @@
 
 
 class ResBlock(nn.Module):
-    r"""ResNet block based on :footcite:`kalchbrenner2018efficient`.
+    r"""ResNet block based on *Efficient Neural Audio Synthesis* [:footcite:`kalchbrenner2018efficient`].
 
     Args:
         n_freq: the number of bins in a spectrogram. (Default: ``128``)

@@ -202,9 +202,9 @@ def forward(self, specgram: Tensor) -> Tuple[Tensor, Tensor]:
 class WaveRNN(nn.Module):
     r"""WaveRNN model based on the implementation from `fatchord <https://github.com/fatchord/WaveRNN>`_.
 
-    The original implementation was introduced in :footcite:`kalchbrenner2018efficient`.
-    The input channels of waveform and spectrogram have to be 1. The product of
-    `upsample_scales` must equal `hop_length`.
+    The original implementation was introduced in *Efficient Neural Audio Synthesis*
+    [:footcite:`kalchbrenner2018efficient`]. The input channels of waveform and spectrogram have to be 1.
+    The product of `upsample_scales` must equal `hop_length`.
 
     Args:
         upsample_scales: the list of upsample scales.
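A sketch exercising the constraint the reworded `WaveRNN` docstring spells out, namely that the product of `upsample_scales` must equal `hop_length` (5 * 5 * 8 = 200 below); the tensor sizes are illustrative:

```python
import torch
from torchaudio.models import WaveRNN

model = WaveRNN(upsample_scales=[5, 5, 8], n_classes=256, hop_length=200)

# With the default kernel_size=5 and n_time spectrogram frames, the paired
# waveform needs (n_time - kernel_size + 1) * hop_length samples.
specgram = torch.rand(1, 1, 128, 64)             # (batch, 1, n_freq, n_time)
waveform = torch.rand(1, 1, (64 - 5 + 1) * 200)  # (batch, 1, 12000)
output = model(waveform, specgram)               # (1, 1, 12000, n_classes)
```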
6 changes: 4 additions & 2 deletions torchaudio/prototype/rnnt_loss.py

@@ -17,7 +17,8 @@ def rnnt_loss(
     fused_log_softmax: bool = True,
     reuse_logits_for_grads: bool = True,
 ):
-    """Compute the RNN Transducer loss from :footcite:`graves2012sequence`.
+    """Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks*
+    [:footcite:`graves2012sequence`].
 
     The RNN Transducer loss extends the CTC loss by defining a distribution over output
     sequences of all lengths, and by jointly modelling both input-output and output-output

@@ -57,7 +58,8 @@
 
 
 class RNNTLoss(torch.nn.Module):
-    """Compute the RNN Transducer loss from :footcite:`graves2012sequence`.
+    """Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks*
+    [:footcite:`graves2012sequence`].
 
     The RNN Transducer loss extends the CTC loss by defining a distribution over output
     sequences of all lengths, and by jointly modelling both input-output and output-output
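A sketch of the prototype `rnnt_loss` (assuming the prototype module is available in the build; the shapes are illustrative, with the joiner output laid out as the docstring describes):

```python
import torch
from torchaudio.prototype.rnnt_loss import rnnt_loss

# Joiner output: (batch, max frames, max target length + 1, num classes).
logits = torch.randn(1, 10, 6, 20, requires_grad=True)
# Target labels drawn from the non-blank classes; with the default
# blank=-1 the last class index (19) serves as the blank label.
targets = torch.randint(0, 19, (1, 5), dtype=torch.int32)
logit_lengths = torch.tensor([10], dtype=torch.int32)
target_lengths = torch.tensor([5], dtype=torch.int32)

costs = rnnt_loss(logits, targets, logit_lengths, target_lengths)
costs.sum().backward()  # sum over the batch before backpropagating
```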
3 changes: 2 additions & 1 deletion torchaudio/transforms.py

@@ -129,7 +129,8 @@ class GriffinLim(torch.nn.Module):
     r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.
 
     Implementation ported from
-    :footcite:`brian_mcfee-proc-scipy-2015`, :footcite:`6701851` and :footcite:`1172092`.
+    *librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`]
+    and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`].
 
     Args:
         n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
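Finally, a sketch of the `GriffinLim` transform whose docstring this hunk updates (the random magnitudes only demonstrate the expected shapes; a real `torchaudio.transforms.Spectrogram` output is the intended input):

```python
import torch
import torchaudio

n_fft = 400
transform = torchaudio.transforms.GriffinLim(n_fft=n_fft)

# A magnitude-only spectrogram with n_fft // 2 + 1 = 201 frequency bins.
# With the default power=2.0, a power spectrogram is expected.
specgram = torch.rand(1, n_fft // 2 + 1, 100)
waveform = transform(specgram)  # (channel, time)
```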