# svara-tts-v1: Multilingual Indic Text-to-Speech


High-quality text-to-speech for **19 languages** (18 Indic + English) optimized for clarity, expressiveness, and low latency.


<center>


[![kenpath.ai](https://img.shields.io/badge/kenpath.ai-Visit-0ea5e9?style=for-the-badge)](https://kenpath.ai/)


[![🤗 Hugging Face - svara-tts-v1 Model](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-black)](https://huggingface.co/kenpath/svara-tts-v1)
[![🤗 Hugging Face - Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-green)](https://huggingface.co/spaces/kenpath/svara-tts)
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/15YxFo1DzdQNbFUIZ1HJA4AN4oHqKxGtg)
[![GitHub](https://img.shields.io/badge/github-%23121011.svg?style=flat&logo=github&logoColor=white)](https://github.com/Kenpath/svara-tts-inference)



In [1]:
#@title ⚙️ Install packages
%%capture
# Install the packages the later cells import; the %%capture cell magic above
# hides pip's console output.
!pip install snac ipywidgets torch transformers

In [2]:
#@title üîß Audio Generation Utilities
%%capture

model_name = "kenpath/svara-tts-v1"

from snac import SNAC
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from IPython.display import display, Audio

# Find out device - cuda or mps or cpu
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
snac_model = snac_model.to(device)

model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("‚úì Model loaded successfully")

def generate_audio_from_text(
    text,
    language,
    gender,
    *,
    max_new_tokens=800,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.2,
):
    """
    Generate audio from text using the Svara-TTS model.

    Uses the module-level ``tokenizer``, ``model``, ``snac_model`` and
    ``device`` objects created when the models were loaded.

    Args:
        text (str): The text to synthesize into speech
        language (str): The language name (e.g., 'Hindi', 'Bengali', 'English')
        gender (str): The gender of the voice ('Male' or 'Female')
        max_new_tokens (int): Upper bound on the number of tokens generated.
            Seven audio tokens form one frame, so this caps the clip length.
        temperature (float): Sampling temperature passed to ``model.generate``.
        top_p (float): Nucleus-sampling threshold passed to ``model.generate``.
        repetition_penalty (float): Repetition penalty passed to
            ``model.generate``.

    Returns:
        numpy.ndarray: Audio waveform array at 24kHz sample rate

    Raises:
        ValueError: If the generated sequence contains no start-of-speech
            token, or contains no complete frame of in-range audio codes.
    """
    START_OF_SPEECH_TOKEN = 128257
    END_OF_SPEECH_TOKEN = 128258
    AUDIO_CODE_BASE_OFFSET = 128266
    CODEBOOK_SIZE = 4096
    FRAME_SIZE = 7  # audio tokens per frame
    # SNAC level fed by each of the 7 slots of a frame:
    # level 0 (coarse) <- slot 0; level 1 (medium) <- slots 1, 4;
    # level 2 (fine) <- slots 2, 3, 5, 6.
    SLOT_LEVELS = (0, 1, 2, 2, 1, 2, 2)

    # Format the prompt for Svara-TTS
    voice = f"{language} ({gender})"
    formatted_text = f"<|audio|> {voice}: {text}<|eot_id|>"
    prompt = "<custom_token_3>" + formatted_text + "<custom_token_4><custom_token_5>"

    # Tokenize the prompt and wrap it in the special token ids.
    # NOTE(review): these ids look like the numeric form of the markers already
    # written into the prompt string — confirm the duplication is intended.
    prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids
    start_token = torch.tensor([[128259]], dtype=torch.int64)
    end_tokens = torch.tensor(
        [[128009, 128260, 128261, START_OF_SPEECH_TOKEN]], dtype=torch.int64
    )
    input_ids = torch.cat([start_token, prompt_ids, end_tokens], dim=1).to(device)

    # Generate speech tokens
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            num_return_sequences=1,
            eos_token_id=END_OF_SPEECH_TOKEN,
        )

    # Everything after the last start-of-speech token is the audio stream.
    row = generated_ids[0]
    start_positions = (row == START_OF_SPEECH_TOKEN).nonzero(as_tuple=True)[0]
    if len(start_positions) == 0:
        raise ValueError("No speech tokens found in generated output")
    audio_tokens = row[start_positions[-1].item() + 1:]

    # Keep only ids inside the audio-code range. The end-of-speech (128258)
    # and pad (128263) ids lie below the range, so this mask drops them too.
    code_range_end = AUDIO_CODE_BASE_OFFSET + FRAME_SIZE * CODEBOOK_SIZE
    in_range = (audio_tokens >= AUDIO_CODE_BASE_OFFSET) & (audio_tokens < code_range_end)
    snac_tokens = (audio_tokens[in_range] - AUDIO_CODE_BASE_OFFSET).tolist()

    # Trim to a whole number of frames.
    usable = len(snac_tokens) - len(snac_tokens) % FRAME_SIZE
    snac_tokens = snac_tokens[:usable]

    # De-interleave frames into the 3 hierarchical SNAC levels.
    codes_lvl = [[], [], []]
    for frame_start in range(0, usable, FRAME_SIZE):
        # Each slot carries its own offset of slot * CODEBOOK_SIZE.
        frame = [
            snac_tokens[frame_start + slot] - slot * CODEBOOK_SIZE
            for slot in range(FRAME_SIZE)
        ]
        # The flat filtering above can shift frame alignment when a token was
        # dropped mid-stream; a misaligned frame yields codes outside
        # [0, CODEBOOK_SIZE). Skip such frames instead of passing invalid
        # indices to the decoder.
        if any(code < 0 or code >= CODEBOOK_SIZE for code in frame):
            continue
        for level, code in zip(SLOT_LEVELS, frame):
            codes_lvl[level].append(code)

    if not codes_lvl[0]:
        # Fail with a clear message rather than decoding empty tensors.
        raise ValueError("No complete audio frames found in generated output")

    # Convert to tensors (with a leading batch dimension) for the SNAC decoder
    hierarchical_codes = [
        torch.tensor(lvl_codes, dtype=torch.long, device=device).unsqueeze(0)
        for lvl_codes in codes_lvl
    ]

    # Decode with SNAC
    with torch.no_grad():
        audio_waveform = snac_model.decode(hierarchical_codes)

    # Convert to numpy array
    return audio_waveform.detach().squeeze().to("cpu").numpy()

# Status message for the end of the utilities cell (check mark repaired from
# mis-decoded bytes).
print("✓ Audio generation utilities loaded successfully")

In [3]:
#@title 🎤 svara Text-to-Speech
from ipywidgets import Textarea, Dropdown, VBox, HBox, Button, Output, HTML
from IPython.display import display, Audio

# --------------------------
# Widgets
# --------------------------
# Text to synthesize. The default is a Hindi sample ("The weather is very nice
# today. How are you?") followed by an emotion tag.
text_input = Textarea(
    value="आज का मौसम बहुत अच्छा है। आप कैसे हैं? <happy>",
    placeholder='Enter your text here',
    layout={'width': '100%', 'height': '90px'},
)

# Language selector; the chosen name is passed to generate_audio_from_text.
language_dropdown = Dropdown(
    options=[
        'Hindi', 'Bengali', 'Marathi', 'Telugu', 'Kannada',
        'Bhojpuri', 'Magahi', 'Chhattisgarhi', 'Maithili',
        'Assamese', 'Bodo', 'Dogri', 'Gujarati', 'Malayalam',
        'Punjabi', 'Tamil', 'English', 'Nepali', 'Sanskrit',
    ],
    value='Hindi',
    layout={'width': '100%'},
)

# Voice gender selector.
gender_dropdown = Dropdown(
    options=['Female', 'Male'],
    value='Female',
    layout={'width': '100%'},
)

# Button that triggers generation once a click handler is registered on it.
submit_button = Button(
    description='🎙️ Generate Speech',
    button_style='success',
    layout={'width': '100%', 'height': '42px',
            'margin': '8px 0 0 0'},
)

# Area that receives status messages and the audio player.
output_area = Output()

# --------------------------
# Logic
# --------------------------
def on_submit_clicked(b):
    """Synthesize speech for the current widget values and show the result.

    Reads the text, language and gender widgets, calls
    ``generate_audio_from_text`` and renders either an autoplaying audio
    player or an error message inside ``output_area``.

    Args:
        b: The button instance supplied by the click callback (unused).
    """
    with output_area:
        output_area.clear_output()
        print("🔄 Generating audio...")
        try:
            audio_array = generate_audio_from_text(
                text=text_input.value,
                language=language_dropdown.value,
                gender=gender_dropdown.value
            )
            output_area.clear_output()
            print(f"✅ Generated | {language_dropdown.value} ({gender_dropdown.value})")
            # Duration in seconds: sample count divided by the 24 kHz rate.
            print(f"⏱ {len(audio_array) / 24000:.2f} sec")
            display(Audio(audio_array, rate=24000, autoplay=True))
        except Exception as e:
            # UI boundary: report any failure in the output area instead of
            # letting it escape the widget callback.
            output_area.clear_output()
            print(f"❌ Error: {e}")

# Run on_submit_clicked whenever the generate button is pressed.
submit_button.on_click(on_submit_clicked)

# --------------------------
# UI Sections
# --------------------------
# Language and gender selectors side by side, each under its own label.
controls = HBox([
    VBox([
        HTML('<label style="font-size:13px; font-weight:600; color:#ddd;">🌐 Language</label>'),
        language_dropdown
    ], layout={'width': '48%'}),

    VBox([
        HTML('<label style="font-size:13px; font-weight:600; color:#ddd;">🧍‍♀️🧍‍♂️ Gender</label>'),
        gender_dropdown
    ], layout={'width': '48%'})
],
layout={
    'width': '100%',
    'justify_content': 'space-between',
    'margin': '6px 0'
})

# Card stacking the title, hint, text box, selectors, button and output area.
app_card = VBox([
    HTML('<h3 style="margin:0;color:#eee;">🎤 svara Text-to-Speech Generator</h3>'),
    # The angle brackets are entity-escaped: a literal <tags> would be parsed
    # as an unknown HTML element and vanish from the rendered hint.
    HTML('<p style="color:#aaa;font-size:12px;margin:2px 0 10px;">Enter text with &lt;tags&gt; for emotion</p>'),

    HTML('<label style="font-size:13px;font-weight:600;color:#ddd;">📝 Text</label>'),
    text_input,

    controls,
    submit_button,

    HTML('<hr style="margin:14px 0;border:0;border-top:1px solid #333;">'),
    output_area
],
layout={
    'padding': '22px',
    'border': '1px solid #333',
    'border_radius': '10px',
    'width': '600px',
    'max_width': '100%',
})

# --------------------------
# Display
# --------------------------
display(app_card)

VBox(children=(HTML(value='<h3 style="margin:0;color:#eee;">🎤 svara Text-to-Speech Generator</h3>'), HTML(valu…