In [6]:
import torch
from transformers import VitsModel, AutoTokenizer
from IPython.display import Audio

print("Setting up model and tokenizer...")

# One-time setup: build the model and tokenizer once so that later cells can
# reuse the `model` and `tokenizer` globals without reloading them.
# Both objects come from the same checkpoint id, so it is named only once.
mms_hindi_checkpoint = "facebook/mms-tts-hin"
model = VitsModel.from_pretrained(mms_hindi_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(mms_hindi_checkpoint)

print("✅ Setup complete. You can now run the next cell.")

Setting up model and tokenizer...
✅ Setup complete. You can now run the next cell.


In [13]:
# --- Run this every time ---
# Turns `text` into speech with the `model` and `tokenizer` globals created by
# the setup cell, then plays the result inline. Re-run after editing `text`.
text = "अब यह बहुत तेजी से काम करेगा क्योंकि मॉडल पहले से ही लोड हो चुका है।"

# Longer alternative input; uncomment to synthesise it instead.
#text = "आज सप्ताह की नई शुरुआत होने के नाते, शहर के लोग अपने-अपने दफ्तरों की ओर जा रहे हैं, इसलिए प्रमुख सड़कों पर सामान्य से अधिक यातायात दिखाई दे रहा है।"

# VITS synthesis is stochastic (see the Transformers VITS documentation), so
# without a fixed seed the same text yields different audio on every run.
# Seeding right before inference makes each run of this cell reproducible.
SEED = 555
torch.manual_seed(SEED)

inputs = tokenizer(text, return_tensors="pt")
# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    output = model(**inputs).waveform

# For Jupyter notebooks only: render an inline audio player.
sampling_rate = model.config.sampling_rate
display(Audio(output.squeeze().numpy(), rate=sampling_rate))

# Rough draft: standalone end-to-end version (reloads the model)


In [4]:
import torch
from transformers import VitsModel, AutoTokenizer
import scipy.io.wavfile

# --- 1. Load Model and Tokenizer ---
# This will download the model and tokenizer from Hugging Face.
# This cell is self-contained: it does not rely on any earlier cell.

model = VitsModel.from_pretrained("facebook/mms-tts-hin")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-hin")

# --- 2. Define Your Hindi Text ---
# The model handles long sentences well.
text = "नमस्ते, गूगल द्वारा बनाए गए जेमिनी में आपका स्वागत है। यह मॉडल लंबे वाक्यों को भी आसानी से ऑडियो में बदल सकता है।"

# --- 3. Prepare Text for the Model (tokenizer) ---
inputs = tokenizer(text, return_tensors="pt")

# --- 4. Generate Speech ---
# The model generates the audio waveform from the tokenized text.
# torch.no_grad() disables gradient tracking, which is not needed for inference.
with torch.no_grad():
    output = model(**inputs).waveform

# --- 5. Save the Audio ---
# The output is a PyTorch tensor; squeeze it and convert it to a NumPy array
# once so the same samples are both written to disk and played back below.
sampling_rate = model.config.sampling_rate
waveform = output.squeeze().numpy()
output_path = "hindi_speech_output.wav"
# The write must actually run: the success message below claims the file exists.
scipy.io.wavfile.write(output_path, rate=sampling_rate, data=waveform)

# --- 6. Play the Audio (Jupyter only) ---
# A bare `Audio(...)` expression is rendered only when it is the last statement
# of a cell; display() renders it explicitly regardless of its position.
from IPython.display import Audio, display
display(Audio(waveform, rate=sampling_rate))

print(f"✅ Audio successfully saved as '{output_path}'")
print(f"Audio sampling rate: {sampling_rate} Hz")

✅ Audio successfully saved as 'hindi_speech_output.wav'
Audio sampling rate: 16000 Hz
