<a href="https://colab.research.google.com/github/shuklaji28/TTS/blob/main/Veena.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U transformers
# !pip install transformers torch torchaudio
# !pip install -U snac bitsandbytes  # For audio decoding and quantization
!pip install -U bitsandbytes

Collecting transformers
  Downloading transformers-4.53.1-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.53.1-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.53.0
    Uninstalling transformers-4.53.0:
      Successfully uninstalled transformers-4.53.0
Successfully installed transformers-4.53.1




In [None]:
import soundfile as sf
import torch
from IPython.display import Audio
from snac import SNAC
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Model configuration for 4-bit inference
# This object is handed to from_pretrained below via `quantization_config`.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load the model with the 4-bit configuration above (the tokenizer is loaded in
# a later cell). generate_speech places its input tensor on `model.device`.
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# model repository to run locally — confirm the repository is trusted.
model = AutoModelForCausalLM.from_pretrained(
    "maya-research/veena-tts",
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True,
)



model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.58G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/180 [00:00<?, ?B/s]

In [None]:
# Tokenizer from the same repository as the model; generate_speech uses it to
# encode the speaker-prefixed prompt.
tokenizer = AutoTokenizer.from_pretrained(
    "maya-research/veena-tts",
    trust_remote_code=True,
)

# Initialize SNAC decoder: load the weights, then switch to eval mode and move to the GPU.
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
snac_model = snac_model.eval().cuda()

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/22.9M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/697 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/79.5M [00:00<?, ?B/s]

In [None]:
# Control token IDs (fixed for Veena)
# generate_speech uses END_OF_SPEECH_TOKEN and END_OF_AI_TOKEN as stop ids and
# wraps the prompt as:
#   START_OF_HUMAN, <prompt tokens>, END_OF_HUMAN, START_OF_AI, START_OF_SPEECH
START_OF_SPEECH_TOKEN = 128257
END_OF_SPEECH_TOKEN = 128258
START_OF_HUMAN_TOKEN = 128259
END_OF_HUMAN_TOKEN = 128260
START_OF_AI_TOKEN = 128261
END_OF_AI_TOKEN = 128262
# Lowest token id treated as an audio code. The code below uses seven
# consecutive 4096-wide id ranges starting here, one per position in a frame.
AUDIO_CODE_BASE_OFFSET = 128266

# Available speakers; a name is inserted into the prompt as "<spk_{name}>".
speakers = ["kavya", "agastya", "maitri", "vinaya"]

def generate_speech(text, speaker="kavya", temperature=0.4, top_p=0.9):
    """Generate speech from text using the specified speaker voice.

    Builds the input sequence ``[HUMAN] <spk_speaker> text [/HUMAN] [AI] [SPEECH]``,
    samples a continuation from the global ``model``, keeps only the generated
    ids that fall in the audio-code range, and decodes them with
    ``decode_snac_tokens`` using the global ``snac_model``.

    Args:
        text: Text to synthesize.
        speaker: Speaker name inserted into the prompt as ``<spk_{speaker}>``.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling threshold passed to ``model.generate``.

    Returns:
        The value returned by ``decode_snac_tokens`` for the generated frames
        (a NumPy array of audio samples clamped to [-1, 1]).

    Raises:
        ValueError: If generation produced no complete 7-token audio frame, or
            if ``decode_snac_tokens`` rejects the token values.
    """
    # decode_snac_tokens consumes audio codes in groups of this size.
    tokens_per_frame = 7

    # Prepare input with speaker token
    prompt = f"<spk_{speaker}> {text}"
    prompt_tokens = tokenizer.encode(prompt, add_special_tokens=False)

    # Construct full sequence: [HUMAN] <spk_speaker> text [/HUMAN] [AI] [SPEECH]
    input_tokens = [
        START_OF_HUMAN_TOKEN,
        *prompt_tokens,
        END_OF_HUMAN_TOKEN,
        START_OF_AI_TOKEN,
        START_OF_SPEECH_TOKEN,
    ]

    input_ids = torch.tensor([input_tokens], device=model.device)

    # Token budget grows with the text length and is capped at 700 new tokens.
    max_tokens = min(int(len(text) * 1.3) * tokens_per_frame + 21, 700)

    # Generate audio tokens
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=1.05,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=[END_OF_SPEECH_TOKEN, END_OF_AI_TOKEN],
        )

    # Keep only the newly generated ids that lie in the audio-code id range.
    generated_ids = output[0][len(input_tokens):].tolist()
    audio_range_end = AUDIO_CODE_BASE_OFFSET + tokens_per_frame * 4096
    snac_tokens = [
        token_id for token_id in generated_ids
        if AUDIO_CODE_BASE_OFFSET <= token_id < audio_range_end
    ]

    # Generation can stop in the middle of a frame (token budget reached, or a
    # stop token sampled early). decode_snac_tokens returns None for a length
    # that is not a multiple of 7, which would make callers fail later when
    # writing the file, so drop the trailing partial frame here.
    usable_length = len(snac_tokens) - len(snac_tokens) % tokens_per_frame
    snac_tokens = snac_tokens[:usable_length]

    if not snac_tokens:
        raise ValueError("No audio tokens generated")

    # Decode audio
    return decode_snac_tokens(snac_tokens, snac_model)

def decode_snac_tokens(snac_tokens, snac_model):
    """Turn a flat list of interleaved audio token ids into a waveform.

    The ids are read in frames of 7. Within a frame, position 0 feeds level 0,
    positions 1 and 4 feed level 1, and positions 2, 3, 5 and 6 feed level 2.
    Each id is shifted down by the id offset of its frame position before
    being handed to ``snac_model.decode``.

    Returns None when ``snac_tokens`` is empty or its length is not a multiple
    of 7; otherwise a NumPy array of decoded samples clamped to [-1, 1].

    Raises ValueError if any shifted code falls outside 0..4095.
    """
    if not snac_tokens:
        return None
    if len(snac_tokens) % 7:
        return None

    # Tensors are created on whichever device holds the SNAC weights.
    target_device = next(snac_model.parameters()).device

    # Frame positions feeding each hierarchical level, in emission order.
    slots_per_level = ((0,), (1, 4), (2, 3, 5, 6))
    frames = [snac_tokens[start:start + 7] for start in range(0, len(snac_tokens), 7)]

    hierarchical_codes = []
    for slots in slots_per_level:
        # Subtract the per-position id offset to recover the raw codebook index.
        level_values = [
            frame[slot] - (AUDIO_CODE_BASE_OFFSET + slot * 4096)
            for frame in frames
            for slot in slots
        ]
        level_tensor = torch.tensor(
            level_values, dtype=torch.int32, device=target_device
        ).unsqueeze(0)
        out_of_range = (level_tensor < 0) | (level_tensor > 4095)
        if torch.any(out_of_range):
            raise ValueError("Invalid SNAC token values")
        hierarchical_codes.append(level_tensor)

    # Decode with SNAC
    with torch.no_grad():
        audio_hat = snac_model.decode(hierarchical_codes)

    waveform = audio_hat.squeeze().clamp(-1, 1)
    return waveform.cpu().numpy()

# --- Example Usage ---

In [None]:
# Generate one sample per (language, speaker) pair so that every
# comb_output_{language}_{speaker}.wav file played back later in the notebook
# is produced by this cell on a fresh run.
speakers = ["kavya", "agastya", "maitri", "vinaya"]

text_hindi = "आज मैंने एक नई तकनीक के बारे में सीखा जो कृत्रिम बुद्धिमत्ता का उपयोग करके मानव जैसी आवाज़ उत्पन्न कर सकती है।"
text_english = "Today I learned about a new technology that uses artificial intelligence to generate human-like voices."
text_mixed = "मैं तो पूरा presentation prepare कर चुका हूं! कल रात को ही मैंने पूरा code base चेक किया।"

texts_by_language = {
    "hindi": text_hindi,
    "english": text_english,
    "mixed": text_mixed,
}

for language, text in texts_by_language.items():
    for speaker_name in speakers:
        file_name = f"comb_output_{language}_{speaker_name}.wav"
        audio = generate_speech(text, speaker=speaker_name)
        if audio is None:
            # Nothing decodable was produced; report it instead of failing inside sf.write.
            print("No audio for", language, speaker_name)
            continue
        sf.write(file_name, audio, 24000)
        print("Done for", language, speaker_name)

Done for kavya
Done for agastya
Done for maitri
Done for vinaya


In [None]:
# Play the code-mixed "agastya" sample. The path is relative because the file
# is written relative to the working directory, not to a fixed /content folder.
Audio("comb_output_mixed_agastya.wav")

In [None]:
# Play the code-mixed "kavya" sample (relative path, matching where it was written).
# Listening note: kavya is english when given mixed text.
Audio("comb_output_mixed_kavya.wav")

In [None]:
# Play the code-mixed "maitri" sample (relative path, matching where it was written).
Audio("comb_output_mixed_maitri.wav")

In [None]:
# Play the code-mixed "vinaya" sample (relative path, matching where it was written).
Audio("comb_output_mixed_vinaya.wav")

In [None]:
# NOTE(review): no cell in this notebook writes "output_hindi_kavya.wav" —
# confirm the file exists before running this cell.
from IPython.display import Audio
Audio("output_hindi_kavya.wav")

In [None]:
# English
text_english = "Today I learned about a new technology that uses artificial intelligence to generate human-like voices."
audio = generate_speech(text_english, speaker="vinaya")
sf.write("output_english_vinaya.wav", audio, 24000)


In [None]:
# Play the English "vinaya" sample written by the preceding generation cell.
Audio("output_english_vinaya.wav")

In [None]:
Audio("comb_output_hindi_vinaya.wav")  # listening note: voice sounds like a girl

In [None]:
Audio("comb_output_english_vinaya.wav")  # listening note: voice sounds like a girl

In [None]:
Audio("comb_output_mixed_vinaya.wav")  # listening note: voice sounds like a girl

In [None]:
Audio("comb_output_english_kavya.wav")  # listening note: voice sounds like a girl

In [None]:
Audio("comb_output_hindi_kavya.wav")  # listening note: voice sounds like a girl

In [None]:
Audio("comb_output_mixed_kavya.wav")  # listening note: voice sounds like a boy

In [None]:
Audio("comb_output_english_maitri.wav")  # listening note: voice sounds like a girl

In [None]:
Audio("comb_output_hindi_maitri.wav")  # listening note: voice sounds like a girl

In [None]:
Audio("comb_output_mixed_maitri.wav")  # listening note: voice sounds like a girl

In [None]:
Audio("comb_output_english_agastya.wav")  # listening note: voice sounds like a girl

In [None]:
Audio("comb_output_hindi_agastya.wav")  # listening note: voice sounds like a girl

In [None]:
Audio("comb_output_mixed_agastya.wav")  # listening note: voice sounds like a girl

In [None]:


# Code-mixed sample with the "maitri" voice: synthesize, save at 24000 Hz, then play.
text_mixed = "मैं तो पूरा presentation prepare कर चुका हूं! कल रात को ही मैंने पूरा code base चेक किया।"
audio = generate_speech(text=text_mixed, speaker="maitri")
sf.write(file="output_mixed_maitri.wav", data=audio, samplerate=24000)
Audio("output_mixed_maitri.wav")

In [None]:


# Code-mixed sample with the "vinaya" voice: synthesize, save at 24000 Hz, then play.
text_mixed = "मैं तो पूरा presentation prepare कर चुका हूं! कल रात को ही मैंने पूरा code base चेक किया।"
audio = generate_speech(text=text_mixed, speaker="vinaya")
sf.write(file="output_mixed_vinaya.wav", data=audio, samplerate=24000)
Audio("output_mixed_vinaya.wav")

In [None]:
# NOTE(review): no cell in this notebook writes "output_english_agastya.wav" —
# confirm the file exists before running this cell.
Audio("output_english_agastya.wav")

## Local Inference on GPU
Model page: https://huggingface.co/maya-research/Veena

⚠️ If the generated code snippets do not work, please open an issue on the [model repo](https://huggingface.co/maya-research/Veena) and/or on [huggingface.js](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries-snippets.ts) 🙏

In [None]:
# Use a pipeline as a high-level helper
# NOTE(review): `pipe` is not used by any other cell in this notebook, and this
# snippet uses the repo id "maya-research/Veena" whereas the cells above load
# "maya-research/veena-tts" — confirm both ids refer to the same checkpoint.
from transformers import pipeline

pipe = pipeline("text-to-speech", model="maya-research/Veena")

In [None]:
# Load model directly
# NOTE(review): this rebinds the global `tokenizer` and `model` names that
# generate_speech reads. The model here is loaded without the
# quantization_config and device_map used earlier in the notebook — confirm
# that is intended before calling generate_speech again after this cell.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("maya-research/Veena")
model = AutoModelForCausalLM.from_pretrained("maya-research/Veena")