# Hugging Face Text-to-Speech (TTS) Model Demo

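This notebook walks through the step-by-step process required by the assignment: load a pre-trained Hugging Face TTS model (`facebook/mms-tts-vie`), tokenize a Vietnamese sentence, synthesize a waveform, play it inline, and optionally save it to `output.wav`.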

In [None]:
# Step 1: Install required packages (run once, in a terminal or in a notebook cell)
# The command is left commented out so that re-running the notebook does not reinstall anything.
# From inside a notebook, prefer the `%pip` magic so the packages land in the running kernel's environment.
# pip install transformers torch IPython soundfile

In [None]:
# Step 2: Import required libraries
from transformers import VitsModel, AutoTokenizer
import torch
from IPython.display import Audio
import soundfile as sf

In [None]:
# Step 3: Download (cached locally after the first run) and load the pre-trained TTS model from Hugging Face
# The model and its tokenizer must come from the same checkpoint, so the id is defined exactly once;
# changing MODEL_ID switches both together.
# You may replace this with any compatible TTS model.
MODEL_ID = "facebook/mms-tts-vie"
model = VitsModel.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [None]:
# Step 4: Choose the sentence to synthesize
# Example text in Vietnamese; edit this string to speak something else.
text = (
    "Xin chào anh em đến với bài tập của khoá AI Application Engineer"
)
# Echo the sentence so the cell output records what was synthesized.
print(f"Input text: {text}")

In [None]:
# Step 5: Convert the input text into model-ready tensors
# return_tensors="pt" asks the tokenizer for PyTorch tensors, which is what the model call in Step 6 unpacks.
tokenizer_options = {"return_tensors": "pt"}
inputs = tokenizer(text, **tokenizer_options)
print("Text tokenized successfully!")

In [None]:
# Step 6: Perform inference to generate the waveform
# VITS predicts durations stochastically, so the same text produces a slightly different
# waveform on every run. Fix the seed so that Restart & Run All reproduces the same audio.
SEED = 555  # arbitrary; any fixed integer works
torch.manual_seed(SEED)

# Inference only: disable autograd so no gradient graph is built for the forward pass.
with torch.no_grad():
    output = model(**inputs).waveform

print(f"Waveform shape: {output.shape}")
# Axis 1 is used as the sample count (assumes the waveform is (batch, num_samples) — per the VITS docs).
print(f"Duration: {output.shape[1] / model.config.sampling_rate:.2f} seconds")

In [None]:
# Step 7: Render an inline audio player for the generated waveform
# The Audio object must stay the last expression in the cell so Jupyter displays it.
playback_rate = model.config.sampling_rate
Audio(data=output.numpy(), rate=playback_rate)

In [None]:
# Optional: Persist the generated audio as a WAV file (requires soundfile)
# squeeze() drops size-1 axes, so the array handed to soundfile no longer carries the
# leading axis of length 1 (presumably the batch dimension — confirm against Step 6's printed shape).
wav_samples = output.numpy().squeeze()
sf.write('output.wav', wav_samples, samplerate=model.config.sampling_rate)
print("Audio saved as 'output.wav'")