Reference: Hugging Face blog post on SpeechT5 — https://huggingface.co/blog/speecht5

In [None]:
!pip install datasets parallel_wavegan speechbrain sentencepiece
!pip install --upgrade transformers

In [2]:
# Library Imports
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
from datasets import load_dataset
import soundfile as sf
import sentencepiece as spm
import torch.nn as nn
import librosa
from transformers import SpeechT5HifiGan
import numpy as np

In [None]:
# Sentence(s) to synthesise. Adjacent literals are concatenated by the parser,
# so this is one single-line string with no embedded newlines.
text = (
    "TTS stands for Text-to-Speech. It is a technology that converts written text "
    'into spoken words, enabling machines to "read aloud" text content. TTS is widely used '
    "in various applications to improve accessibility, enhance user interaction, and "
    "enable automation."
)

# Speaker embeddings: validation split of the CMU ARCTIC x-vector dataset.
# A later cell indexes into it to choose a voice.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

# Text-to-speech checkpoint; the processor and the model share one name.
model_name = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(pretrained_model_name_or_path=model_name)
model = SpeechT5ForTextToSpeech.from_pretrained(pretrained_model_name_or_path=model_name)

# HiFi-GAN vocoder that turns the model's spectrogram output into a waveform.
# (An earlier draft loaded SpeechBrain's LJSpeech HiFi-GAN here instead.)
vocoder = SpeechT5HifiGan.from_pretrained(
    pretrained_model_name_or_path="microsoft/speecht5_hifigan"
)

# Tokenise the text into PyTorch tensors. Despite its name, `input_ids` holds the
# whole processor output; the token ids are read from its "input_ids" entry later.
input_ids = processor(text=text, return_tensors="pt")

In [4]:
# Select a specific speaker embedding
new_speaker_index = 7930  # Example index, you can choose any valid index
speaker_embeddings = torch.tensor(
    embeddings_dataset[new_speaker_index]["xvector"]
).unsqueeze(0)  # add a leading batch dimension

# Synthesise exactly once: text -> mel spectrogram -> waveform.
# Calling generate_speech(..., vocoder=vocoder) afterwards would repeat the whole
# synthesis and throw this result away, so only the explicit two-step path is kept.
with torch.no_grad():
    mel_outputs = model.generate_speech(
        input_ids["input_ids"], speaker_embeddings=speaker_embeddings
    )
    speech = vocoder(mel_outputs)

speech = speech.numpy()

# Peak-normalise so the loudest sample has magnitude 1. Skip the division when
# the signal is empty or all zeros, which would otherwise raise or fill the
# array with NaNs.
peak = np.max(np.abs(speech)) if speech.size else 0.0
if peak > 0:
    speech = speech / peak

# Write the waveform to disk; `file_name` is reused by the video cell below.
file_name = f"TTS{new_speaker_index}.wav"
sf.write(file_name, speech, samplerate=16000)

model.safetensors:   0%|          | 0.00/585M [00:00<?, ?B/s]

## Convert .wav to .mp4

In [5]:
# Import only the two clip classes this cell uses rather than `import *`,
# so it is clear where each name comes from.
from moviepy.editor import AudioFileClip, ColorClip

# Load your audio file
audio_clip = AudioFileClip(file_name)
try:
    # Make a black screen of the same duration as the audio clip,
    # then attach the audio track to it
    video_clip = ColorClip(
        size=(1920, 1080), color=(0, 0, 0), duration=audio_clip.duration
    ).set_audio(audio_clip)

    # Write the result to a file
    video_clip.write_videofile("video_with_audio.mp4", fps=24)
finally:
    # Release the audio reader even if encoding fails
    audio_clip.close()

  if event.key is 'enter':



Moviepy - Building video video_with_audio.mp4.
MoviePy - Writing audio in video_with_audioTEMP_MPY_wvf_snd.mp3




MoviePy - Done.
Moviepy - Writing video video_with_audio.mp4





Moviepy - Done !
Moviepy - video ready video_with_audio.mp4
