6 changes: 4 additions & 2 deletions torchaudio/functional/functional.py

@@ -155,7 +155,8 @@ def griffinlim(
     r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.
 
     Implementation ported from
-    :footcite:`brian_mcfee-proc-scipy-2015`, :footcite:`6701851` and :footcite:`1172092`.
+    *librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`]
+    and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`].
 
     Args:
         specgram (Tensor): A magnitude-only STFT spectrogram of dimension (..., freq, frames)

@@ -1207,7 +1208,8 @@ def compute_kaldi_pitch(
     recompute_frame: int = 500,
     snip_edges: bool = True,
 ) -> torch.Tensor:
-    """Extract pitch based on method described in :footcite:`6854049`.
+    """Extract pitch based on method described in *A pitch extraction algorithm tuned
+    for automatic speech recognition* [:footcite:`6854049`].
 
     This function computes the equivalent of `compute-kaldi-pitch-feats` from Kaldi.
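For reviewers who want to exercise the function this hunk documents, here is a minimal sketch of `compute_kaldi_pitch` (the shapes and the 16 kHz sample rate are illustrative assumptions, and the function requires a torchaudio build with the Kaldi integration):

```python
import torch
import torchaudio.functional as F

# One second of dummy audio; any float tensor of shape (..., time) works.
waveform = torch.rand(1, 16000)

# Output has shape (..., frames, 2); the two values per frame are the
# NCCF and the pitch estimate (see the docstring for the exact order).
pitch_feature = F.compute_kaldi_pitch(waveform, sample_rate=16000)
```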
4 changes: 3 additions & 1 deletion torchaudio/models/conv_tasnet.py

@@ -164,7 +164,9 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
 
 
 class ConvTasNet(torch.nn.Module):
-    """Conv-TasNet: a fully-convolutional time-domain audio separation network :footcite:`Luo_2019`.
+    """Conv-TasNet: a fully-convolutional time-domain audio separation network
+    *Conv-TasNet: Surpassing Ideal Time–Frequency Magnitude Masking for Speech Separation*
+    [:footcite:`Luo_2019`].
 
     Args:
         num_sources (int): The number of sources to split.
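A minimal usage sketch of the `ConvTasNet` class whose docstring changes here (batch size and frame count are arbitrary):

```python
import torch
from torchaudio.models import ConvTasNet

model = ConvTasNet(num_sources=2)

# Conv-TasNet expects single-channel input: (batch, channel == 1, frames).
mixture = torch.rand(3, 1, 32000)
separated = model(mixture)  # (batch, num_sources, frames) -> (3, 2, 32000)
```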
3 changes: 2 additions & 1 deletion torchaudio/models/deepspeech.py

@@ -31,7 +31,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 class DeepSpeech(torch.nn.Module):
     """
-    DeepSpeech model architecture from :footcite:`hannun2014deep`.
+    DeepSpeech model architecture from *Deep Speech: Scaling up end-to-end speech recognition*
+    [:footcite:`hannun2014deep`].
 
     Args:
         n_feature: Number of input features
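A corresponding sketch for `DeepSpeech` (the 40-feature, 29-class configuration is an illustrative assumption):

```python
import torch
from torchaudio.models import DeepSpeech

model = DeepSpeech(n_feature=40, n_class=29)

# Input is (batch, channel == 1, time, n_feature), e.g. filterbank frames.
features = torch.rand(4, 1, 100, 40)
output = model(features)  # log-probabilities of shape (batch, time, n_class)
```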
3 changes: 2 additions & 1 deletion torchaudio/models/wav2letter.py

@@ -7,7 +7,8 @@
 
 
 class Wav2Letter(nn.Module):
-    r"""Wav2Letter model architecture from :footcite:`collobert2016wav2letter`.
+    r"""Wav2Letter model architecture from *Wav2Letter: an End-to-End ConvNet-based Speech
+    Recognition System* [:footcite:`collobert2016wav2letter`].
 
     :math:`\text{padding} = \frac{\text{ceil}(\text{kernel} - \text{stride})}{2}`
 
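Likewise for `Wav2Letter`, using its documented defaults:

```python
import torch
from torchaudio.models import Wav2Letter

# input_type="waveform" prepends a conv frontend over the raw audio.
model = Wav2Letter(num_classes=40, input_type="waveform", num_features=1)

waveform = torch.rand(2, 1, 16000)  # (batch, num_features, input_length)
output = model(waveform)            # (batch, num_classes, output_length)
```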
8 changes: 4 additions & 4 deletions torchaudio/models/wav2vec2/model.py

@@ -7,7 +7,7 @@
 
 
 class Wav2Vec2Model(Module):
-    """Encoder model used in [:footcite:`baevski2020wav2vec`].
+    """Encoder model used in *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
 
     Note:
         To build the model, please use one of the factory functions.

@@ -122,7 +122,7 @@ def _get_model(
 
 
 def wav2vec2_base(num_out: int) -> Wav2Vec2Model:
-    """Build wav2vec2.0 model with "Base" configuration from [:footcite:`baevski2020wav2vec`].
+    """Build wav2vec2.0 model with "Base" configuration from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
 
     Args:
         num_out: int

@@ -164,7 +164,7 @@ def wav2vec2_base(num_out: int) -> Wav2Vec2Model:
 
 
 def wav2vec2_large(num_out: int) -> Wav2Vec2Model:
-    """Build wav2vec2.0 model with "Large" configuration from [:footcite:`baevski2020wav2vec`].
+    """Build wav2vec2.0 model with "Large" configuration from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
 
     Args:
         num_out: int

@@ -206,7 +206,7 @@ def wav2vec2_large(num_out: int) -> Wav2Vec2Model:
 
 
 def wav2vec2_large_lv60k(num_out: int) -> Wav2Vec2Model:
-    """Build wav2vec2.0 model with "Large LV-60k" configuration from [:footcite:`baevski2020wav2vec`].
+    """Build wav2vec2.0 model with "Large LV-60k" configuration from *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
 
     Args:
         num_out: int
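A sketch of the `wav2vec2_base` factory function touched here (the `num_out=32` output size is an illustrative assumption; the other factories are used the same way):

```python
import torch
from torchaudio.models import wav2vec2_base

model = wav2vec2_base(num_out=32)

# Raw audio of shape (batch, time); lengths are optional, so the second
# return value is None here.
waveforms = torch.randn(2, 16000)
logits, lengths = model(waveforms)  # logits: (batch, frames, num_out)
```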
8 changes: 4 additions & 4 deletions torchaudio/models/wavernn.py

@@ -14,7 +14,7 @@
 
 
 class ResBlock(nn.Module):
-    r"""ResNet block based on :footcite:`kalchbrenner2018efficient`.
+    r"""ResNet block based on *Efficient Neural Audio Synthesis* [:footcite:`kalchbrenner2018efficient`].
 
     Args:
         n_freq: the number of bins in a spectrogram. (Default: ``128``)

@@ -202,9 +202,9 @@ def forward(self, specgram: Tensor) -> Tuple[Tensor, Tensor]:
 class WaveRNN(nn.Module):
     r"""WaveRNN model based on the implementation from `fatchord <https://github.com/fatchord/WaveRNN>`_.
 
-    The original implementation was introduced in :footcite:`kalchbrenner2018efficient`.
-    The input channels of waveform and spectrogram have to be 1. The product of
-    `upsample_scales` must equal `hop_length`.
+    The original implementation was introduced in *Efficient Neural Audio Synthesis*
+    [:footcite:`kalchbrenner2018efficient`]. The input channels of waveform and spectrogram have to be 1.
+    The product of `upsample_scales` must equal `hop_length`.
 
     Args:
         upsample_scales: the list of upsample scales.
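A sketch exercising the constraint the reworded `WaveRNN` docstring spells out, namely that the product of `upsample_scales` must equal `hop_length` (5 * 5 * 8 = 200 below); the tensor sizes are illustrative:

```python
import torch
from torchaudio.models import WaveRNN

model = WaveRNN(upsample_scales=[5, 5, 8], n_classes=256, hop_length=200)

# With the default kernel_size=5 and n_time spectrogram frames, the paired
# waveform needs (n_time - kernel_size + 1) * hop_length samples.
specgram = torch.rand(1, 1, 128, 64)             # (batch, 1, n_freq, n_time)
waveform = torch.rand(1, 1, (64 - 5 + 1) * 200)  # (batch, 1, 12000)
output = model(waveform, specgram)               # (1, 1, 12000, n_classes)
```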
6 changes: 4 additions & 2 deletions torchaudio/prototype/rnnt_loss.py

@@ -17,7 +17,8 @@ def rnnt_loss(
     fused_log_softmax: bool = True,
     reuse_logits_for_grads: bool = True,
 ):
-    """Compute the RNN Transducer loss from :footcite:`graves2012sequence`.
+    """Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks*
+    [:footcite:`graves2012sequence`].
 
     The RNN Transducer loss extends the CTC loss by defining a distribution over output
     sequences of all lengths, and by jointly modelling both input-output and output-output

@@ -57,7 +58,8 @@
 
 
 class RNNTLoss(torch.nn.Module):
-    """Compute the RNN Transducer loss from :footcite:`graves2012sequence`.
+    """Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks*
+    [:footcite:`graves2012sequence`].
 
     The RNN Transducer loss extends the CTC loss by defining a distribution over output
     sequences of all lengths, and by jointly modelling both input-output and output-output
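A sketch of the prototype `rnnt_loss` (assuming the prototype module is available in the build; the shapes are illustrative, with the joiner output laid out as the docstring describes):

```python
import torch
from torchaudio.prototype.rnnt_loss import rnnt_loss

# Joiner output: (batch, max frames, max target length + 1, num classes).
logits = torch.randn(1, 10, 6, 20, requires_grad=True)
# Target labels drawn from the non-blank classes; with the default
# blank=-1 the last class index (19) serves as the blank label.
targets = torch.randint(0, 19, (1, 5), dtype=torch.int32)
logit_lengths = torch.tensor([10], dtype=torch.int32)
target_lengths = torch.tensor([5], dtype=torch.int32)

costs = rnnt_loss(logits, targets, logit_lengths, target_lengths)
costs.sum().backward()  # sum over the batch before backpropagating
```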
3 changes: 2 additions & 1 deletion torchaudio/transforms.py

@@ -129,7 +129,8 @@ class GriffinLim(torch.nn.Module):
     r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation.
 
     Implementation ported from
-    :footcite:`brian_mcfee-proc-scipy-2015`, :footcite:`6701851` and :footcite:`1172092`.
+    *librosa* [:footcite:`brian_mcfee-proc-scipy-2015`], *A fast Griffin-Lim algorithm* [:footcite:`6701851`]
+    and *Signal estimation from modified short-time Fourier transform* [:footcite:`1172092`].
 
     Args:
         n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins. (Default: ``400``)
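Finally, a sketch of the `GriffinLim` transform whose docstring this hunk updates (the random magnitudes only demonstrate the expected shapes; a real `torchaudio.transforms.Spectrogram` output is the intended input):

```python
import torch
import torchaudio

n_fft = 400
transform = torchaudio.transforms.GriffinLim(n_fft=n_fft)

# A magnitude-only spectrogram with n_fft // 2 + 1 = 201 frequency bins.
# With the default power=2.0, a power spectrogram is expected.
specgram = torch.rand(1, n_fft // 2 + 1, 100)
waveform = transform(specgram)  # (channel, time)
```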