diff --git a/torchaudio/functional/functional.py b/torchaudio/functional/functional.py
index f9b1d79baa..2f1bd60961 100644
--- a/torchaudio/functional/functional.py
+++ b/torchaudio/functional/functional.py
@@ -155,7 +155,8 @@ def griffinlim(
     r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.
 
     Implementation ported from
-    :footcite:`brian_mcfee-proc-scipy-2015`, :footcite:`6701851` and :footcite:`1172092`.
+    *librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`]
+    and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`].
 
     Args:
         specgram (Tensor): A magnitude-only STFT spectrogram of dimension (..., freq, frames)
@@ -1207,7 +1208,8 @@ def compute_kaldi_pitch(
         recompute_frame: int = 500,
         snip_edges: bool = True,
 ) -> torch.Tensor:
-    """Extract pitch based on method described in :footcite:`6854049`.
+    """Extract pitch based on method described in *A pitch extraction algorithm tuned
+    for automatic speech recognition* [:footcite:`6854049`].
 
     This function computes the equivalent of `compute-kaldi-pitch-feats` from Kaldi.
diff --git a/torchaudio/models/conv_tasnet.py b/torchaudio/models/conv_tasnet.py
index c9a88191ae..74652f764d 100644
--- a/torchaudio/models/conv_tasnet.py
+++ b/torchaudio/models/conv_tasnet.py
@@ -164,7 +164,9 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
 
 
 class ConvTasNet(torch.nn.Module):
-    """Conv-TasNet: a fully-convolutional time-domain audio separation network :footcite:`Luo_2019`.
+    """Conv-TasNet: a fully-convolutional time-domain audio separation network
+    *Conv-TasNet: Surpassing Ideal Time–Frequency Magnitude Masking for Speech Separation*
+    [:footcite:`Luo_2019`].
 
     Args:
         num_sources (int): The number of sources to split.
diff --git a/torchaudio/models/deepspeech.py b/torchaudio/models/deepspeech.py
index e325275278..41efc07d9e 100644
--- a/torchaudio/models/deepspeech.py
+++ b/torchaudio/models/deepspeech.py
@@ -31,7 +31,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 class DeepSpeech(torch.nn.Module):
     """
-    DeepSpeech model architecture from :footcite:`hannun2014deep`.
+    DeepSpeech model architecture from *Deep Speech: Scaling up end-to-end speech recognition*
+    [:footcite:`hannun2014deep`].
 
     Args:
         n_feature: Number of input features
diff --git a/torchaudio/models/wav2letter.py b/torchaudio/models/wav2letter.py
index c47c6f4aed..4d93e74392 100644
--- a/torchaudio/models/wav2letter.py
+++ b/torchaudio/models/wav2letter.py
@@ -7,7 +7,8 @@
 
 
 class Wav2Letter(nn.Module):
-    r"""Wav2Letter model architecture from :footcite:`collobert2016wav2letter`.
+    r"""Wav2Letter model architecture from *Wav2Letter: an End-to-End ConvNet-based Speech
+    Recognition System* [:footcite:`collobert2016wav2letter`].
 
     :math:`\text{padding} = \frac{\text{ceil}(\text{kernel} - \text{stride})}{2}`
diff --git a/torchaudio/models/wav2vec2/model.py b/torchaudio/models/wav2vec2/model.py
index 9c6bb06599..62400ebc45 100644
--- a/torchaudio/models/wav2vec2/model.py
+++ b/torchaudio/models/wav2vec2/model.py
@@ -7,7 +7,7 @@
 
 
 class Wav2Vec2Model(Module):
-    """Encoder model used in [:footcite:`baevski2020wav2vec`].
+    """Encoder model used in *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
 
     Note:
         To build the model, please use one of the factory functions.
@@ -122,7 +122,7 @@ def _get_model(
 
 
 def wav2vec2_base(num_out: int) -> Wav2Vec2Model:
-    """Build wav2vec2.0 model with "Base" configuration from [:footcite:`baevski2020wav2vec`].
+    """Build wav2vec2.0 model with "Base" configuration from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
 
     Args:
         num_out: int
@@ -164,7 +164,7 @@ def wav2vec2_base(num_out: int) -> Wav2Vec2Model:
 
 
 def wav2vec2_large(num_out: int) -> Wav2Vec2Model:
-    """Build wav2vec2.0 model with "Large" configuration from [:footcite:`baevski2020wav2vec`].
+    """Build wav2vec2.0 model with "Large" configuration from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
 
     Args:
         num_out: int
@@ -206,7 +206,7 @@ def wav2vec2_large(num_out: int) -> Wav2Vec2Model:
 
 
 def wav2vec2_large_lv60k(num_out: int) -> Wav2Vec2Model:
-    """Build wav2vec2.0 model with "Large LV-60k" configuration from [:footcite:`baevski2020wav2vec`].
+    """Build wav2vec2.0 model with "Large LV-60k" configuration from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
 
     Args:
         num_out: int
diff --git a/torchaudio/models/wavernn.py b/torchaudio/models/wavernn.py
index 3763821ad6..89c1e9d430 100644
--- a/torchaudio/models/wavernn.py
+++ b/torchaudio/models/wavernn.py
@@ -14,7 +14,7 @@
 
 
 class ResBlock(nn.Module):
-    r"""ResNet block based on :footcite:`kalchbrenner2018efficient`.
+    r"""ResNet block based on *Efficient Neural Audio Synthesis* [:footcite:`kalchbrenner2018efficient`].
 
     Args:
         n_freq: the number of bins in a spectrogram. (Default: ``128``)
@@ -202,9 +202,9 @@ def forward(self, specgram: Tensor) -> Tuple[Tensor, Tensor]:
 class WaveRNN(nn.Module):
     r"""WaveRNN model based on the implementation from `fatchord `_.
 
-    The original implementation was introduced in :footcite:`kalchbrenner2018efficient`.
-    The input channels of waveform and spectrogram have to be 1. The product of
-    `upsample_scales` must equal `hop_length`.
+    The original implementation was introduced in *Efficient Neural Audio Synthesis*
+    [:footcite:`kalchbrenner2018efficient`]. The input channels of waveform and spectrogram have to be 1.
+    The product of `upsample_scales` must equal `hop_length`.
 
     Args:
         upsample_scales: the list of upsample scales.
diff --git a/torchaudio/prototype/rnnt_loss.py b/torchaudio/prototype/rnnt_loss.py
index 0765ea2dcd..60246e6857 100644
--- a/torchaudio/prototype/rnnt_loss.py
+++ b/torchaudio/prototype/rnnt_loss.py
@@ -17,7 +17,8 @@ def rnnt_loss(
     fused_log_softmax: bool = True,
     reuse_logits_for_grads: bool = True,
 ):
-    """Compute the RNN Transducer loss from :footcite:`graves2012sequence`.
+    """Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks*
+    [:footcite:`graves2012sequence`].
 
     The RNN Transducer loss extends the CTC loss by defining a distribution over output
     sequences of all lengths, and by jointly modelling both input-output and output-output
@@ -57,7 +58,8 @@
 
 
 class RNNTLoss(torch.nn.Module):
-    """Compute the RNN Transducer loss from :footcite:`graves2012sequence`.
+    """Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks*
+    [:footcite:`graves2012sequence`].
 
     The RNN Transducer loss extends the CTC loss by defining a distribution over output
     sequences of all lengths, and by jointly modelling both input-output and output-output
diff --git a/torchaudio/transforms.py b/torchaudio/transforms.py
index dbd0cd2495..eb4eef46be 100644
--- a/torchaudio/transforms.py
+++ b/torchaudio/transforms.py
@@ -129,7 +129,8 @@ class GriffinLim(torch.nn.Module):
     r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.
 
     Implementation ported from
-    :footcite:`brian_mcfee-proc-scipy-2015`, :footcite:`6701851` and :footcite:`1172092`.
+    *librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`]
+    and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`].
 
     Args:
         n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
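The hunks above only rework how the docstrings cite their sources; the documented APIs are unchanged. For readers unfamiliar with the Griffin-Lim transform touched by the first and last hunks, a minimal usage sketch follows. It is not part of the patch: the file name "example.wav" is a placeholder, and n_fft=400 is chosen only because it mirrors the documented default; the power=2 pairing between Spectrogram and GriffinLim relies on both transforms' defaults.

import torchaudio

# Load some audio; "example.wav" is a placeholder path for this sketch.
waveform, sample_rate = torchaudio.load("example.wav")

# Forward transform: a power spectrogram (default power=2), which matches
# GriffinLim's default expectation for its input.
spectrogram = torchaudio.transforms.Spectrogram(n_fft=400)(waveform)

# Griffin-Lim iteratively re-estimates the phase that the magnitude-only
# spectrogram discarded and returns a time-domain waveform of shape (..., time).
griffin_lim = torchaudio.transforms.GriffinLim(n_fft=400)
reconstructed = griffin_lim(spectrogram)

print(waveform.shape, reconstructed.shape)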