-
Notifications
You must be signed in to change notification settings - Fork 634
/
hifigan_pipeline.py
228 lines (197 loc) · 9.43 KB
/
hifigan_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
from dataclasses import dataclass
from typing import Any, Dict, Optional
import torch
import torch.nn.functional as F
from torch.nn import Module
from torchaudio._internal import load_state_dict_from_url
from torchaudio.prototype.models.hifi_gan import hifigan_vocoder, HiFiGANVocoder
from torchaudio.transforms import MelSpectrogram
@dataclass
class HiFiGANVocoderBundle:
    """Data class that bundles the information needed to use a pretrained
    :py:class:`~torchaudio.prototype.models.HiFiGANVocoder`.

    The bundle exposes factory methods for the pretrained model and carries the
    information required to fetch its weights, together with the extra settings
    that go with the model.

    Torchaudio instantiates objects of this class, one per pretrained model.
    Client code should reach pretrained models through those instances.

    The bundle can convert mel spectrograms to waveforms and vice versa. A typical flow is
    `text -> mel spectrogram -> waveform`, where an external component, e.g. Tacotron2,
    produces the mel spectrogram from text. See the code examples below.

    Example: Transform synthetic mel spectrogram to audio.
        >>> import torch
        >>> import torchaudio
        >>> # Since HiFiGAN bundle is in prototypes, it needs to be exported explicitly
        >>> from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH as bundle
        >>>
        >>> # Load the HiFiGAN bundle
        >>> vocoder = bundle.get_vocoder()
        Downloading: "https://download.pytorch.org/torchaudio/models/hifigan_vocoder_v3_ljspeech.pth"
        100%|████████████| 5.59M/5.59M [00:00<00:00, 18.7MB/s]
        >>>
        >>> # Generate synthetic mel spectrogram
        >>> specgram = torch.sin(0.5 * torch.arange(start=0, end=100)).expand(bundle._vocoder_params["in_channels"], 100)
        >>>
        >>> # Transform mel spectrogram into audio
        >>> waveform = vocoder(specgram)
        >>> torchaudio.save('sample.wav', waveform, bundle.sample_rate)

    Example: Usage together with Tacotron2, text to audio.
        >>> import torch
        >>> import torchaudio
        >>> # Since HiFiGAN bundle is in prototypes, it needs to be exported explicitly
        >>> from torchaudio.prototype.pipelines import HIFIGAN_VOCODER_V3_LJSPEECH as bundle_hifigan
        >>>
        >>> # Load Tacotron2 bundle
        >>> bundle_tacotron2 = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
        >>> processor = bundle_tacotron2.get_text_processor()
        >>> tacotron2 = bundle_tacotron2.get_tacotron2()
        >>>
        >>> # Use Tacotron2 to convert text to mel spectrogram
        >>> text = "A quick brown fox jumped over a lazy dog"
        >>> input, lengths = processor(text)
        >>> specgram, lengths, _ = tacotron2.infer(input, lengths)
        >>>
        >>> # Load HiFiGAN bundle
        >>> vocoder = bundle_hifigan.get_vocoder()
        Downloading: "https://download.pytorch.org/torchaudio/models/hifigan_vocoder_v3_ljspeech.pth"
        100%|████████████| 5.59M/5.59M [00:03<00:00, 1.55MB/s]
        >>>
        >>> # Use HiFiGAN to convert mel spectrogram to audio
        >>> waveform = vocoder(specgram).squeeze(0)
        >>> torchaudio.save('sample.wav', waveform, bundle_hifigan.sample_rate)
    """  # noqa: E501

    _path: str  # File name of the weights, appended to the torchaudio models URL
    _vocoder_params: Dict[str, Any]  # Vocoder parameters
    _mel_params: Dict[str, Any]  # Mel transformation parameters
    _sample_rate: float

    def _get_state_dict(self, dl_kwargs):
        """Fetch the pretrained weights referenced by ``self._path``.

        Args:
            dl_kwargs (dict or None): Extra keyword arguments forwarded to
                ``load_state_dict_from_url``; ``None`` means no extra arguments.

        Returns:
            Whatever ``load_state_dict_from_url`` returns for the weights URL.
        """
        download_options = dl_kwargs if dl_kwargs is not None else {}
        weights_url = f"https://download.pytorch.org/torchaudio/models/{self._path}"
        return load_state_dict_from_url(weights_url, **download_options)

    def get_vocoder(self, *, dl_kwargs=None) -> HiFiGANVocoder:
        """Construct the HiFiGAN Generator model, which can be used as a vocoder, and load the pretrained weight.

        The weight file is downloaded from the internet and cached with
        :func:`torch.hub.load_state_dict_from_url`

        Args:
            dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`.

        Returns:
            Variation of :py:class:`~torchaudio.prototype.models.HiFiGANVocoder`.
        """
        vocoder = hifigan_vocoder(**self._vocoder_params)
        pretrained_weights = self._get_state_dict(dl_kwargs)
        vocoder.load_state_dict(pretrained_weights)
        # The model is handed back in evaluation mode.
        vocoder.eval()
        return vocoder

    def get_mel_transform(self) -> Module:
        """Construct an object which transforms waveforms into mel spectrograms.

        The number of mel bins is taken from the vocoder's ``in_channels`` and the
        sample rate from the bundle; the remaining settings come from ``_mel_params``.
        """
        mel_bins = self._vocoder_params["in_channels"]
        bundle_rate = self._sample_rate
        return _HiFiGANMelSpectrogram(n_mels=mel_bins, sample_rate=bundle_rate, **self._mel_params)

    @property
    def sample_rate(self):
        """Sample rate of the audio that the model is trained on.

        :type: float
        """
        return self._sample_rate
class _HiFiGANMelSpectrogram(torch.nn.Module):
    """
    Generate mel spectrogram in a way equivalent to the original HiFiGAN implementation:
    https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/meldataset.py#L49-L72

    This class wraps around :py:class:`torchaudio.transforms.MelSpectrogram`, but performs extra steps to achieve
    equivalence with the HiFiGAN implementation.

    Args:
        hop_size (int): Length of hop between STFT windows.
        n_fft (int): Size of FFT, creates ``n_fft // 2 + 1`` bins.
        win_length (int): Window size.
        f_min (float or None): Minimum frequency.
        f_max (float or None): Maximum frequency.
        sample_rate (float): Sample rate of audio signal.
        n_mels (int): Number of mel filterbanks.
    """

    def __init__(
        self,
        hop_size: int,
        n_fft: int,
        win_length: int,
        f_min: Optional[float],
        f_max: Optional[float],
        sample_rate: float,
        n_mels: int,
    ):
        super().__init__()

        # Keep the configuration on the module for later inspection.
        self.sample_rate = sample_rate
        self.hop_size = hop_size
        self.n_fft = n_fft
        self.win_length = win_length
        self.f_min = f_min
        self.f_max = f_max
        self.n_mels = n_mels

        # Amount of reflect padding applied to each side of the waveform in ``forward``.
        self.pad_size = int((n_fft - hop_size) / 2)

        # The wrapped transform is built with ``center=False`` and ``pad=0``;
        # ``forward`` pads the waveform explicitly instead.
        self.mel_transform = MelSpectrogram(
            sample_rate=sample_rate,
            n_mels=n_mels,
            n_fft=n_fft,
            win_length=win_length,
            hop_length=hop_size,
            f_min=f_min,
            f_max=f_max,
            pad=0,
            center=False,
            normalized=False,
            norm="slaney",
            mel_scale="slaney",
        )

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """Generate mel spectrogram from a waveform. Should have same sample rate as ``self.sample_rate``.

        Args:
            waveform (Tensor): waveform of shape ``(batch_size, time_length)``.

        Returns:
            Tensor of shape ``(batch_size, n_mels, time_length)``
        """
        # Add a temporary channel dimension for the reflect padding, then drop it again.
        both_sides = (self.pad_size, self.pad_size)
        padded = F.pad(waveform.unsqueeze(1), both_sides, mode="reflect").squeeze(1)

        # Square root of the epsilon-offset spectrogram; presumably this turns a power
        # spectrogram into magnitudes (depends on MelSpectrogram's default ``power`` - confirm).
        root_spec = (self.mel_transform.spectrogram(padded) + 1e-9).pow(0.5)

        mel = self.mel_transform.mel_scale(root_spec)
        # Clamp from below before the log so that zeros do not produce -inf.
        return mel.clamp(min=1e-5).log()
# Bundle instance for the pretrained HiFiGAN vocoder weights stored in
# "hifigan_vocoder_v3_ljspeech.pth". All fields are passed by keyword.
HIFIGAN_VOCODER_V3_LJSPEECH = HiFiGANVocoderBundle(
    _path="hifigan_vocoder_v3_ljspeech.pth",
    _sample_rate=22050,
    # Settings forwarded to the mel transform built by ``get_mel_transform``.
    _mel_params={
        "hop_size": 256,
        "n_fft": 1024,
        "win_length": 1024,
        "f_min": 0,
        "f_max": 8000,
    },
    # Keyword arguments forwarded to ``hifigan_vocoder`` by ``get_vocoder``.
    # NOTE(review): the product of the upsample rates (8 * 8 * 4 = 256) equals the mel
    # ``hop_size`` above - presumably one mel frame maps to 256 output samples; confirm.
    _vocoder_params={
        "upsample_rates": (8, 8, 4),
        "upsample_kernel_sizes": (16, 16, 8),
        "upsample_initial_channel": 256,
        "resblock_kernel_sizes": (3, 5, 7),
        "resblock_dilation_sizes": ((1, 2), (2, 6), (3, 12)),
        "resblock_type": 2,
        # Also used as ``n_mels`` by ``get_mel_transform``.
        "in_channels": 80,
        "lrelu_slope": 0.1,
    },
)
# Per-instance documentation for the bundle object (reStructuredText). Paragraphs are
# separated by blank lines so that each one renders as its own paragraph.
HIFIGAN_VOCODER_V3_LJSPEECH.__doc__ = """HiFiGAN Vocoder pipeline, trained on *The LJ Speech Dataset*
    :cite:`ljspeech17`.

    This pipeline can be used with an external component which generates mel spectrograms from text, for example,
    Tacotron2 - see examples in :py:class:`HiFiGANVocoderBundle`.
    Although this works with the existing Tacotron2 bundles, for the best results one needs to retrain Tacotron2
    using the same data preprocessing pipeline which was used for training HiFiGAN. In particular, the original
    HiFiGAN implementation uses a custom method of generating mel spectrograms from waveforms, different from
    :py:class:`torchaudio.transforms.MelSpectrogram`. We reimplemented this transform as
    :py:meth:`HiFiGANVocoderBundle.get_mel_transform`, making sure it is equivalent to the original HiFiGAN code `here
    <https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/meldataset.py#L49-L72>`_.

    The underlying vocoder is constructed by
    :py:func:`torchaudio.prototype.models.hifigan_vocoder`. The weights are converted from the ones published
    with the original paper :cite:`NEURIPS2020_c5d73680` under `MIT License
    <https://github.com/jik876/hifi-gan/blob/4769534d45265d52a904b850da5a622601885777/LICENSE>`__. See links to
    pre-trained models on `GitHub <https://github.com/jik876/hifi-gan#pretrained-model>`__.

    Please refer to :py:class:`HiFiGANVocoderBundle` for usage instructions.
"""