In [29]:
import torch
import torchaudio
from IPython.display import Audio

# Text that will be converted to speech.
text = "Hello, I'm a student studying computer science."

# Select the compute device: the GPU when CUDA is available, the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# 1. WaveRNN vocoder
# Author's note: the WaveRNN vocoder has no OS-specific restrictions.
print("Generating speech with WaveRNN Vocoder...")

# Fetch the three pipeline stages from the bundle: text processor, Tacotron2
# model and vocoder. The two models are moved onto `device`.
bundle_wavernn = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
processor_wavernn = bundle_wavernn.get_text_processor()
tacotron2_wavernn = bundle_wavernn.get_tacotron2()
tacotron2_wavernn = tacotron2_wavernn.to(device)
vocoder_wavernn = bundle_wavernn.get_vocoder()
vocoder_wavernn = vocoder_wavernn.to(device)

with torch.inference_mode():
    # The processor returns two tensors for `text`; both go onto `device`.
    processed_wavernn, lengths_wavernn = (
        tensor.to(device) for tensor in processor_wavernn(text)
    )
    # `infer` returns three values; the third one is discarded into `_`.
    spec_wavernn, spec_lengths_wavernn, _ = tacotron2_wavernn.infer(
        processed_wavernn,
        lengths_wavernn,
    )
    # NOTE: `lengths_wavernn` is rebound here to the vocoder's second return
    # value, so it no longer holds the processor's lengths after this line.
    waveforms_wavernn, lengths_wavernn = vocoder_wavernn(
        spec_wavernn,
        spec_lengths_wavernn,
    )

# Save the generated audio: only the first waveform of the batch, copied to
# the CPU, at the sample rate reported by the vocoder.
torchaudio.save(
    "wavernn_output.wav",
    waveforms_wavernn[:1].cpu(),
    sample_rate=vocoder_wavernn.sample_rate,
)
print("WaveRNN output saved as wavernn_output.wav")


# 2. Griffin-Lim vocoder
# Author's note: the Griffin-Lim vocoder has no OS-specific restrictions.
print("\nGenerating speech with Griffin-Lim Vocoder...")

# Fetch the three pipeline stages from the bundle: text processor, Tacotron2
# model and vocoder. The two models are moved onto `device`.
bundle_griffinlim = torchaudio.pipelines.TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH
processor_griffinlim = bundle_griffinlim.get_text_processor()
tacotron2_griffinlim = bundle_griffinlim.get_tacotron2()
tacotron2_griffinlim = tacotron2_griffinlim.to(device)
vocoder_griffinlim = bundle_griffinlim.get_vocoder()
vocoder_griffinlim = vocoder_griffinlim.to(device)

with torch.inference_mode():
    # The processor returns two tensors for `text`; both go onto `device`.
    processed_griffinlim, lengths_griffinlim = (
        tensor.to(device) for tensor in processor_griffinlim(text)
    )
    # `infer` returns three values; the third one is discarded into `_`.
    # `spec_griffinlim` is kept at top level and reused by the Waveglow section.
    spec_griffinlim, spec_lengths_griffinlim, _ = tacotron2_griffinlim.infer(
        processed_griffinlim,
        lengths_griffinlim,
    )
    # NOTE: `lengths_griffinlim` is rebound here to the vocoder's second return
    # value, so it no longer holds the processor's lengths after this line.
    waveforms_griffinlim, lengths_griffinlim = vocoder_griffinlim(
        spec_griffinlim,
        spec_lengths_griffinlim,
    )

# Save the generated audio: only the first waveform of the batch, copied to
# the CPU, at the sample rate reported by the vocoder.
torchaudio.save(
    "griffinlim_output.wav",
    waveforms_griffinlim[:1].cpu(),
    sample_rate=vocoder_griffinlim.sample_rate,
)
print("Griffin-Lim output saved as griffinlim_output.wav")


# 3. Waveglow vocoder
# Author's note: the Waveglow vocoder was developed by NVIDIA and has no
# OS-specific restrictions.
print("\nGenerating speech with Waveglow Vocoder...")

# Load the fp32 Waveglow entry point from NVIDIA's torch.hub repository.
# NOTE(review): with pretrained=True the hub entry point loads the checkpoint
# itself; confirm this also works on a CPU-only host (device == "cpu").
hub_kwargs = dict(model_math="fp32", pretrained=True, trust_repo=True)
waveglow = torch.hub.load(
    "NVIDIA/DeepLearningExamples:torchhub",
    "nvidia_waveglow",
    **hub_kwargs,
)

# Strip weight normalisation, move the model onto `device`, then switch it to
# evaluation mode.
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow = waveglow.to(device)
waveglow.eval()

# Reuse `spec_griffinlim`, the spectrogram produced earlier by the Tacotron2
# model of the Griffin-Lim bundle (not by the Griffin-Lim vocoder itself).
with torch.no_grad():
    waveforms_waveglow = waveglow.infer(spec_griffinlim)

# Save the generated audio: only the first waveform of the batch, copied to
# the CPU. The 22050 Hz rate is hard-coded; presumably it is the sampling rate
# the checkpoint was trained with -- TODO confirm.
torchaudio.save(
    "waveglow_output.wav",
    waveforms_waveglow[:1].cpu(),
    sample_rate=22050,
)
print("Waveglow output saved as waveglow_output.wav")

Generating speech with WaveRNN Vocoder...
Downloading: "https://download.pytorch.org/torchaudio/models/tacotron2_english_phonemes_1500_epochs_wavernn_ljspeech.pth" to /root/.cache/torch/hub/checkpoints/tacotron2_english_phonemes_1500_epochs_wavernn_ljspeech.pth


100%|██████████| 107M/107M [00:02<00:00, 49.7MB/s]


Downloading: "https://download.pytorch.org/torchaudio/models/wavernn_10k_epochs_8bits_ljspeech.pth" to /root/.cache/torch/hub/checkpoints/wavernn_10k_epochs_8bits_ljspeech.pth


100%|██████████| 16.7M/16.7M [00:00<00:00, 26.0MB/s]
  s = torchaudio.io.StreamWriter(uri, format=muxer, buffer_size=buffer_size)


WaveRNN output saved as wavernn_output.wav

Generating speech with Griffin-Lim Vocoder...
Downloading: "https://download.pytorch.org/torchaudio/models/tacotron2_english_phonemes_1500_epochs_ljspeech.pth" to /root/.cache/torch/hub/checkpoints/tacotron2_english_phonemes_1500_epochs_ljspeech.pth


100%|██████████| 107M/107M [00:02<00:00, 47.9MB/s]


Griffin-Lim output saved as griffinlim_output.wav

Generating speech with Waveglow Vocoder...


Using cache found in /root/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub
Downloading checkpoint from https://api.ngc.nvidia.com/v2/models/nvidia/waveglow_ckpt_fp32/versions/19.09.0/files/nvidia_waveglowpyt_fp32_20190427
  WeightNorm.apply(module, name, dim)


Waveglow output saved as waveglow_output.wav
