These are the ASR models: \
en: https://drive.google.com/drive/folders/1mqo_d5Mcf956iH0n0xIS4X_NOZ1vXvya?usp=drive_link \
hi: https://drive.google.com/drive/folders/182ZNPLnSvodBAlNLsRzlY3D4uGGoS4OB?usp=drive_link \
ta: https://drive.google.com/drive/folders/1rbXRoqvfA5GowbX_R0Cngb7QSDvv8E48?usp=drive_link

These are the MT models (each link contains a model and its respective tokenizer): \
en -> hi: https://drive.google.com/drive/folders/1LbAIBXrOMtIGRfVm69zWIUnA-2esyOhL?usp=drive_link \
en -> ta: https://drive.google.com/drive/folders/1l1X1P1wnmovINpSjnc55Fh9KgmrqHMqh?usp=drive_link \
hi -> en: https://drive.google.com/drive/folders/1kxiyoAiEGFvtjqyhMygK3JipKOHZsFV5?usp=drive_link \
ta -> en: https://drive.google.com/drive/folders/1kStRq1b3bnK8hZdwSirxQdHxnJdVEZMQ?usp=drive_link

These are the TTS models: \
en: https://drive.google.com/drive/folders/1PG4rTH3Ul41Sb33LZ6A9OSsXH_Gqfx_A?usp=drive_link \
hi: https://drive.google.com/drive/folders/1ixGccj10mTFaYneoPPbpLqXMNsHgNKYs?usp=drive_link \
ta: https://drive.google.com/drive/folders/1vFF5THwn_9Rsb593H_i79BXAjmQAphh_?usp=drive_link

# Install necessary dependencies and libraries

In [2]:
!apt-get install -y portaudio19-dev
!pip install pyaudio -q

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libportaudio2 libportaudiocpp0
Suggested packages:
  portaudio19-doc
The following NEW packages will be installed:
  libportaudio2 libportaudiocpp0 portaudio19-dev
0 upgraded, 3 newly installed, 0 to remove and 30 not upgraded.
Need to get 188 kB of archives.
After this operation, 927 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libportaudio2 amd64 19.6.0-1.1 [65.3 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libportaudiocpp0 amd64 19.6.0-1.1 [16.1 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 portaudio19-dev amd64 19.6.0-1.1 [106 kB]
Fetched 188 kB in 1s (242 kB/s)
Selecting previously unselected package libportaudio2:amd64.
(Reading database ... 126315 files and directories currently installed.)
Preparing to unpack .../libportaudio2_19.6.0-1.

In [3]:
! pip install parler-tts -q
!pip install -U huggingface_hub[hf_xet] -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/80.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.2/80.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.7/100.7 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdon

In [4]:
import torch
import librosa
import numpy as np
import pyaudio
import wave
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, MBartForConditionalGeneration, MBart50Tokenizer, AutoTokenizer
from parler_tts import ParlerTTSForConditionalGeneration
from IPython.display import Audio
from huggingface_hub import HfApi
import soundfile as sf



# Model setup and loading

In [5]:
# Device setup: run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [6]:
# Model paths: Hugging Face Hub repository ids for every stage of the pipeline.

# Speech recognition checkpoints, keyed by language code.
ASR_MODELS = dict(
    en="SrihariGKS/wav2vec-asr-fine-tuned-english-3",
    hi="SrihariGKS/wav2vec-asr-fine-tuned-hindi-3",
    ta="SrihariGKS/wav2vec-asr-fine-tuned-tamil-6",
)

# Translation checkpoints and their tokenizers, keyed by (source, target).
# Both tables are derived from one ordered list of pairs so their keys always
# match; only directions to or from English are available.
MT_MODELS = {
    (src, tgt): f"SrihariGKS/mbart-mt-fine-tuned-model-{src}-{tgt}"
    for src, tgt in (("en", "hi"), ("hi", "en"), ("en", "ta"), ("ta", "en"))
}

MT_TOKENIZERS = {
    (src, tgt): f"SrihariGKS/mbart-mt-fine-tuned-token-{src}-{tgt}"
    for src, tgt in MT_MODELS
}

# Speech synthesis checkpoints, keyed by language code.
TTS_MODELS = dict(
    en="SrihariGKS/parler-tts-fine-tuned-english",
    hi="SrihariGKS/parler-tts-fine-tuned-hindi-3",
    ta="SrihariGKS/parler-tts-fine-tuned-tamil-3",
)

# Preload all models
def preload_models():
    """Load every ASR, MT and TTS model (and its tokenizer/processor) up front.

    Models are moved to the module-level ``device``; tokenizers and processors
    are not. Progress is printed before each model is loaded.

    Returns:
        A 7-tuple of dictionaries, in this order:
        ``asr_models``, ``asr_processors`` (keyed by language code),
        ``mt_models``, ``mt_tokenizers`` (keyed by ``(src_lang, tgt_lang)``),
        ``tts_models``, ``tts_tokenizers``, ``tts_desc_tokenizers``
        (keyed by language code).
    """
    supported_langs = ["en", "hi", "ta"]

    # --- ASR: one CTC model and one processor per language ---
    asr_models, asr_processors = {}, {}
    for lang in supported_langs:
        print(f"Loading ASR model for {lang}...")
        asr_repo = ASR_MODELS[lang]
        asr_models[lang] = Wav2Vec2ForCTC.from_pretrained(asr_repo).to(device)
        asr_processors[lang] = Wav2Vec2Processor.from_pretrained(asr_repo)

    # --- MT: model and tokenizer live in separate repositories per direction ---
    mt_models, mt_tokenizers = {}, {}
    for pair, mt_repo in MT_MODELS.items():
        src_lang, tgt_lang = pair
        print(f"Loading MT model for {src_lang} -> {tgt_lang}...")
        mt_models[pair] = MBartForConditionalGeneration.from_pretrained(mt_repo).to(device)
        mt_tokenizers[pair] = MBart50Tokenizer.from_pretrained(MT_TOKENIZERS[pair])

    # --- TTS: half-precision model plus two tokenizers per language ---
    tts_models, tts_tokenizers, tts_desc_tokenizers = {}, {}, {}
    for lang in supported_langs:
        print(f"Loading TTS model for {lang}...")
        tts_repo = TTS_MODELS[lang]
        speech_model = ParlerTTSForConditionalGeneration.from_pretrained(
            tts_repo, torch_dtype=torch.float16
        ).to(device)
        tts_models[lang] = speech_model
        # Prompt text is tokenized with the TTS repository's own tokenizer ...
        tts_tokenizers[lang] = AutoTokenizer.from_pretrained(tts_repo)
        # ... while the voice description uses the tokenizer named by the
        # model's text-encoder config.
        tts_desc_tokenizers[lang] = AutoTokenizer.from_pretrained(
            speech_model.config.text_encoder._name_or_path
        )

    return (
        asr_models,
        asr_processors,
        mt_models,
        mt_tokenizers,
        tts_models,
        tts_tokenizers,
        tts_desc_tokenizers,
    )

# Live audio capture - Not used

In [None]:
# Capture live audio from microphone
def capture_live_audio(duration=5, sample_rate=16000, chunk=1024, channels=1, output_file="live_audio.wav"):
    """Record from the microphone with PyAudio and save the result as a WAV file.

    Args:
        duration: Recording length in seconds.
        sample_rate: Sampling rate requested from the input stream, in Hz.
        chunk: Number of frames read from the stream per call.
        channels: Number of input channels to record.
        output_file: Path of the WAV file to write.

    Returns:
        ``output_file``, after the 16-bit PCM recording has been written to it.
    """
    p = pyaudio.PyAudio()
    try:
        # Query the sample width while the PortAudio session is still alive.
        sample_width = p.get_sample_size(pyaudio.paInt16)
        stream = p.open(format=pyaudio.paInt16,
                        channels=channels,
                        rate=sample_rate,
                        input=True,
                        frames_per_buffer=chunk)
        try:
            print(f"Recording for {duration} seconds...")
            # Whole chunks only: a trailing partial chunk is not recorded.
            frames = [stream.read(chunk) for _ in range(int(sample_rate / chunk * duration))]
            print("Recording finished.")
        finally:
            # Release the input stream even if a read fails part-way through.
            stream.stop_stream()
            stream.close()
    finally:
        # Always shut the PortAudio session down so the device is not left open.
        p.terminate()

    # The context manager closes the file even if writing raises.
    with wave.open(output_file, 'wb') as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(sample_rate)
        wf.writeframes(b''.join(frames))
    return output_file

# Transcription, Translation and Audio Generation functions

In [7]:
# ASR: Transcribe audio to text
def transcribe_audio(file_path, asr_model, asr_processor):
    """Transcribe an audio file with a CTC speech-recognition model.

    The file is loaded at 16 kHz, zero-padded at the end to at least 16000
    samples, and decoded greedily (arg-max over the logits at each step).

    Args:
        file_path: Path of the audio file to load with librosa.
        asr_model: Model whose output exposes ``.logits``.
        asr_processor: Processor used both to build the model input and to
            decode the predicted ids.

    Returns:
        The decoded transcription of the first (only) item in the batch.
    """
    sample_rate = 16000
    min_samples = 16000

    waveform, _ = librosa.load(file_path, sr=sample_rate)

    # Very short clips are extended with trailing silence.
    missing = min_samples - len(waveform)
    if missing > 0:
        waveform = np.pad(waveform, (0, missing), mode='constant')

    features = asr_processor(waveform, sampling_rate=sample_rate, return_tensors="pt")
    model_input = features.input_values.to(device)

    with torch.no_grad():
        logits = asr_model(model_input).logits

    best_ids = logits.argmax(dim=-1)
    return asr_processor.decode(best_ids[0])

In [8]:
# MT: Translate text with pivot if necessary
def translate_text_with_pivot(text, src_lang, tgt_lang, mt_models, mt_tokenizers):
    """Translate ``text`` from ``src_lang`` to ``tgt_lang``, pivoting through English if needed.

    Args:
        text: Text to translate.
        src_lang: Source language code.
        tgt_lang: Target language code.
        mt_models: Translation models keyed by ``(src_lang, tgt_lang)``.
        mt_tokenizers: Tokenizers keyed the same way as ``mt_models``.

    Returns:
        ``text`` unchanged when both languages are equal; otherwise the
        translation, produced directly when a model for the pair exists and
        via an intermediate English translation when it does not.

    Raises:
        KeyError: If a model or tokenizer needed for a pivot step is missing.
    """
    # Same language on both sides: nothing to translate.
    if src_lang == tgt_lang:
        return text

    # Direct translation
    direct_pair = (src_lang, tgt_lang)
    if direct_pair in mt_models:
        direct_model = mt_models[direct_pair]
        direct_tokenizer = mt_tokenizers[direct_pair]
        return translate_text(text, direct_model, direct_tokenizer)

    # Pivot through English, first leg: source -> English (skipped for English input).
    english_text = text
    if src_lang != "en":
        to_english = (src_lang, "en")
        first_model = mt_models[to_english]
        first_tokenizer = mt_tokenizers[to_english]
        english_text = translate_text(text, first_model, first_tokenizer)
        print(f"Intermediate Text (en): {english_text}")

    # Second leg: English -> target (skipped when English is the target).
    if tgt_lang == "en":
        return english_text

    from_english = ("en", tgt_lang)
    second_model = mt_models[from_english]
    second_tokenizer = mt_tokenizers[from_english]
    return translate_text(english_text, second_model, second_tokenizer)

# Helper MT function
def translate_text(text, mt_model, mt_tokenizer):
    """Translate ``text`` with a single model/tokenizer pair.

    The input is truncated to 128 tokens and decoded with 4-beam search,
    producing at most 128 tokens.

    Args:
        text: Source text.
        mt_model: Sequence-to-sequence model exposing ``generate``.
        mt_tokenizer: Tokenizer used for both encoding and decoding.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    encoded = mt_tokenizer(text, return_tensors="pt", max_length=128, truncation=True)
    source_ids = encoded.input_ids.to(device)
    generated = mt_model.generate(source_ids, max_length=128, num_beams=4, early_stopping=True)
    return mt_tokenizer.decode(generated[0], skip_special_tokens=True)

In [9]:
# TTS: Convert text to speech
def text_to_speech(text, tts_model, tts_tokenizer, tts_desc_tokenizer, description=None):
    """Synthesize speech for ``text`` with a description-conditioned TTS model.

    Args:
        text: The sentence to speak (used as the model prompt).
        tts_model: Model whose ``generate`` accepts description ids/mask and
            prompt ids/mask.
        tts_tokenizer: Tokenizer applied to ``text``.
        tts_desc_tokenizer: Tokenizer applied to the voice description.
        description: Optional natural-language description of the voice and
            recording conditions. When omitted, the original built-in
            description is used, so existing callers are unaffected.

    Returns:
        The generated waveform as a squeezed ``float32`` NumPy array.
    """
    if description is None:
        # Built-in description, kept byte-for-byte (including the inner quotes).
        description = "'Jaya delivers her words quite expressively, in a very confined sounding environment with clear audio quality.'"

    desc_inputs = tts_desc_tokenizer(description, return_tensors="pt").to(device)
    prompt_inputs = tts_tokenizer(text, return_tensors="pt").to(device)

    generation = tts_model.generate(
        input_ids=desc_inputs.input_ids,
        attention_mask=desc_inputs.attention_mask,
        prompt_input_ids=prompt_inputs.input_ids,
        prompt_attention_mask=prompt_inputs.attention_mask,
    )

    # Move to host memory, drop singleton dimensions, and normalise the dtype.
    audio_arr = generation.cpu().numpy().squeeze()
    return audio_arr.astype("float32")

# Pipeline of ASR, MT and TTS

In [10]:
# Speech-to-speech pipeline
def speech_to_speech_pipeline(src_lang, tgt_lang, asr_models, asr_processors, mt_models, mt_tokenizers,
                            tts_models, tts_tokenizers, tts_desc_tokenizers, live_audio=True, audio_file_path=None, record_duration=5):
    """Run ASR -> MT -> TTS on one utterance and return playable audio.

    Args:
        src_lang: Language code of the input speech ('en', 'hi' or 'ta').
        tgt_lang: Language code of the output speech ('en', 'hi' or 'ta').
        asr_models, asr_processors: Preloaded ASR objects keyed by language code.
        mt_models, mt_tokenizers: Preloaded MT objects keyed by (src, tgt).
        tts_models, tts_tokenizers, tts_desc_tokenizers: Preloaded TTS objects
            keyed by language code.
        live_audio: When true, record from the microphone via
            ``capture_live_audio``; ``audio_file_path`` is then ignored.
        audio_file_path: Audio file to translate when ``live_audio`` is false.
        record_duration: Recording length in seconds for live capture.

    Returns:
        An ``Audio`` display object wrapping the synthesized waveform at the
        TTS model's configured sampling rate.

    Raises:
        ValueError: If either language is unsupported, or if ``live_audio`` is
            false and no ``audio_file_path`` is given.
    """
    known_langs = ("en", "hi", "ta")
    if not (src_lang in known_langs and tgt_lang in known_langs):
        raise ValueError("Unsupported language. Choose from 'en', 'hi', or 'ta'.")

    # Step 1: decide where the audio comes from.
    if live_audio:
        audio_file_path = capture_live_audio(duration=record_duration)
    elif audio_file_path is None:
        raise ValueError("Audio file path must be provided if live_audio is False.")

    # Step 2: speech -> text in the source language.
    recognizer = asr_models[src_lang]
    recognizer_processor = asr_processors[src_lang]
    transcription = transcribe_audio(audio_file_path, recognizer, recognizer_processor)
    print(f"Transcription ({src_lang}): {transcription}")

    # Step 3: source text -> target text (pivoting through English when needed).
    translated_text = translate_text_with_pivot(transcription, src_lang, tgt_lang, mt_models, mt_tokenizers)
    print(f"Translated Text ({tgt_lang}): {translated_text}")

    # Step 4: target text -> speech.
    speaker_model = tts_models[tgt_lang]
    prompt_tokenizer = tts_tokenizers[tgt_lang]
    description_tokenizer = tts_desc_tokenizers[tgt_lang]
    waveform = text_to_speech(translated_text, speaker_model, prompt_tokenizer, description_tokenizer)
    return Audio(waveform, rate=speaker_model.config.sampling_rate)

In [11]:
# Preload all models once; the resulting dictionaries are reused by every
# pipeline call in the cells below.
print("Preloading all models...")
(
    asr_models,
    asr_processors,
    mt_models,
    mt_tokenizers,
    tts_models,
    tts_tokenizers,
    tts_desc_tokenizers,
) = preload_models()

Preloading all models...
Loading ASR model for en...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/2.12k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/536 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/30.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

Loading ASR model for hi...


config.json:   0%|          | 0.00/2.12k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/45.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

Loading ASR model for ta...


config.json:   0%|          | 0.00/2.12k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/30.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

Loading MT model for en -> hi...


config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/10.9k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/992 [00:00<?, ?B/s]

Loading MT model for hi -> en...


config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/10.9k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/992 [00:00<?, ?B/s]

Loading MT model for en -> ta...


config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/10.9k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/992 [00:00<?, ?B/s]

Loading MT model for ta -> en...


config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/10.9k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/992 [00:00<?, ?B/s]

Loading TTS model for en...


config.json:   0%|          | 0.00/7.86k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.29G [00:00<?, ?B/s]

  WeightNorm.apply(module, name, dim)
  "_name_or_path": "google/flan-t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "ear

generation_config.json:   0%|          | 0.00/267 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Loading TTS model for hi...


config.json:   0%|          | 0.00/7.41k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.75G [00:00<?, ?B/s]

  "_name_or_path": "google/flan-t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "transformers_version": "4.46.1",
  "use_cache": true,
  "vocab_size": 32128
}

  "_name_or_path": "ylacombe/dac_44khz",
  "architectures": [
    "DacModel"
  ],
  "codebook_dim": 8,
  "codebook_loss_weight": 1.0,
  "codebook_size": 1024,
  "commitment_loss_weight": 0.25,
  "decoder_hidden_si

generation_config.json:   0%|          | 0.00/218 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/990 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/10.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Loading TTS model for ta...


config.json:   0%|          | 0.00/7.41k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.75G [00:00<?, ?B/s]

  "_name_or_path": "google/flan-t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "transformers_version": "4.46.1",
  "use_cache": true,
  "vocab_size": 32128
}

  "_name_or_path": "ylacombe/dac_44khz",
  "architectures": [
    "DacModel"
  ],
  "codebook_dim": 8,
  "codebook_loss_weight": 1.0,
  "codebook_size": 1024,
  "commitment_loss_weight": 0.25,
  "decoder_hidden_si

generation_config.json:   0%|          | 0.00/218 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/990 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/10.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

# For uploading from device

In [12]:
from google.colab import files
from IPython.display import display, Audio
import os
import shutil
import uuid

# Ask the user to pick an audio file from their machine.
uploaded = files.upload()

# Resolve each uploaded name against the current working directory instead of
# assuming "/content". If several files were uploaded, only the last one is
# translated.
uploaded_file_path = None
for fname in uploaded.keys():
    uploaded_file_path = os.path.abspath(fname)
    print(f"Uploaded file: {uploaded_file_path}")

if uploaded_file_path is None:
    # The upload dialog was cancelled: report it plainly rather than letting
    # the pipeline call fail on an undefined variable.
    print("Error: no file was uploaded.")
else:
    # strip() tolerates accidental whitespace around the language code.
    source_language = input("Enter source language (en/hi/ta): ").strip().lower()
    target_language = input("Enter target language (en/hi/ta): ").strip().lower()

    try:
        translated_audio = speech_to_speech_pipeline(
            source_language, target_language,
            asr_models, asr_processors,
            mt_models, mt_tokenizers,
            tts_models, tts_tokenizers, tts_desc_tokenizers,
            live_audio=False,
            audio_file_path=uploaded_file_path
        )

        display(translated_audio)

    except Exception as e:
        # Best-effort demo cell: show the problem without a traceback.
        print(f"Error: {e}")

Saving recorded.wav to recorded (1).wav
Uploaded file: /content/recorded (1).wav
Enter source language (en/hi/ta): en
Enter target language (en/hi/ta): hi


  audio_input, sr = librosa.load(file_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Transcription (en): greetings every one
Translated Text (hi): नमस्कार, हर एक




# For single sentence

In [None]:
from IPython.display import display, Javascript, Audio
from google.colab import output
import base64
import io

# JavaScript injected into the cell output. It renders Start/Stop buttons and
# a status line, and defines a global async `record()` that resolves with the
# recording encoded as a base64 data URL once "Stop Recording" is clicked.
# Compared with the earlier version of this snippet:
#   * the `onstop` handler is registered before the recorder is started, so the
#     stop event can never fire before anything is listening for it;
#   * the microphone tracks are stopped afterwards, so the browser releases the
#     microphone instead of keeping it open.
RECORD_WITH_BUTTON = """
let div = document.createElement('div');
div.innerHTML = `
  <button id="startBtn">Start Recording</button>
  <button id="stopBtn" disabled>Stop Recording</button>
  <p id="status">Click "Start Recording" to begin.</p>
`;
document.body.appendChild(div);

const sleep = time => new Promise(resolve => setTimeout(resolve, time));
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader();
  reader.onloadend = () => resolve(reader.result);
  reader.readAsDataURL(blob);
});

var record = async function() {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  const mediaRecorder = new MediaRecorder(stream);
  var audioChunks = [];

  document.getElementById("startBtn").disabled = true;
  document.getElementById("stopBtn").disabled = false;
  document.getElementById("status").innerText = "Recording... Press Stop when done.";

  mediaRecorder.ondataavailable = e => {
    if (e.data.size > 0) {
      audioChunks.push(e.data);
    }
  };

  // Listen for the stop event before recording starts.
  const stopped = new Promise(resolve => { mediaRecorder.onstop = resolve; });
  document.getElementById("stopBtn").onclick = () => mediaRecorder.stop();

  mediaRecorder.start();
  await stopped;

  // Release the microphone.
  stream.getTracks().forEach(track => track.stop());

  const blob = new Blob(audioChunks);
  const base64data = await b2text(blob);
  document.getElementById("status").innerText = "Recording stopped.";
  return base64data;
}
"""

def record_audio():
    """Record audio in the browser and save it to ``/content/recorded.wav``.

    Displays the recorder UI, blocks until the JavaScript ``record()`` call
    returns a data URL, then writes the decoded base64 payload to disk.

    Returns:
        The path of the file that was written.
    """
    display(Javascript(RECORD_WITH_BUTTON))
    print("Please use the UI buttons above to record.")
    data_url = output.eval_js('record()')
    print("Recording complete.")

    # A data URL looks like "<header>,<base64 payload>"; keep the payload only.
    encoded_payload = data_url.split(',')[1]
    raw_bytes = base64.b64decode(encoded_payload)

    file_path = "/content/recorded.wav"
    with open(file_path, "wb") as audio_file:
        audio_file.write(raw_bytes)
    return file_path

# Record once and translate exactly the file that record_audio() reports
# writing, instead of repeating its path as a second hard-coded literal.
recorded_file_path = record_audio()

# strip() tolerates accidental whitespace around the language code.
source_language = input("Enter source language (en/hi/ta): ").strip().lower()
target_language = input("Enter target language (en/hi/ta): ").strip().lower()
use_live_audio = False

try:
    audio = speech_to_speech_pipeline(
        source_language, target_language,
        asr_models, asr_processors,
        mt_models, mt_tokenizers,
        tts_models, tts_tokenizers, tts_desc_tokenizers,
        live_audio=use_live_audio,
        audio_file_path=recorded_file_path,
        record_duration=5
    )
    display(audio)
except Exception as e:
    # Best-effort demo cell: show the problem without a traceback.
    print(f"Error: {e}")

<IPython.core.display.Javascript object>

Please use the UI buttons above to record.
Recording complete.
Enter source language (en/hi/ta): en
Enter target language (en/hi/ta): hi


  audio_input, sr = librosa.load(file_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Transcription (en): greetings every one
Translated Text (hi): नमस्कार, हर एक


# Continuous speech input

In [None]:
from IPython.display import display, Javascript, Audio
from google.colab import output
import base64
import io

# JavaScript injected into the cell output by record_audio(). It renders
# Start/Stop buttons plus a status line and defines a global async `record()`.
# `record()` requests microphone access, starts a MediaRecorder, and resolves
# with the recording as a base64 data URL after "Stop Recording" is clicked;
# the microphone tracks are stopped inside the `onstop` handler.
# NOTE(review): no click handler is attached to "Start Recording" - recording
# begins as soon as `record()` runs and the button is only disabled.
# NOTE(review): the Blob is labelled 'audio/wav', but nothing here transcodes
# the MediaRecorder output - presumably the data stays in the browser's native
# recording format; confirm before relying on the .wav extension.
RECORD_WITH_BUTTON = """
let div = document.createElement('div');
div.innerHTML = `
  <button id="startBtn">Start Recording</button>
  <button id="stopBtn" disabled>Stop Recording</button>
  <p id="status">Click "Start Recording" to begin.</p>
`;
document.body.appendChild(div);

const sleep = time => new Promise(resolve => setTimeout(resolve, time));
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader();
  reader.onloadend = () => resolve(reader.result);
  reader.readAsDataURL(blob);
});

var record = async function() {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  const mediaRecorder = new MediaRecorder(stream);
  let audioChunks = [];

  document.getElementById("startBtn").disabled = true;
  document.getElementById("stopBtn").disabled = false;
  document.getElementById("status").innerText = "Recording... Press Stop when done.";

  mediaRecorder.ondataavailable = e => {
    if (e.data.size > 0) {
      audioChunks.push(e.data);
    }
  };

  return new Promise((resolve) => {
    document.getElementById("stopBtn").onclick = () => {
      mediaRecorder.stop();
    };

    mediaRecorder.onstop = async () => {
      // Stop mic stream
      stream.getTracks().forEach(track => track.stop());

      const blob = new Blob(audioChunks, { type: 'audio/wav' });
      const base64data = await b2text(blob);
      document.getElementById("status").innerText = "Recording stopped.";
      resolve(base64data);
    };

    mediaRecorder.start();
  });
}
"""

def record_audio():
    """Show the browser recorder, wait for a recording, and store it on disk.

    The JavaScript ``record()`` function returns a data URL; everything after
    the first comma-separated field is treated as base64 and decoded into
    ``/content/recorded.wav``.

    Returns:
        The path of the file that was written.
    """
    display(Javascript(RECORD_WITH_BUTTON))
    print("Please use the UI buttons above to record.")
    recording_url = output.eval_js('record()')
    print("Recording complete.")

    # Second comma-separated field of the data URL is the base64 payload.
    payload_b64 = recording_url.split(',')[1]
    file_path = "/content/recorded.wav"
    with open(file_path, "wb") as recording_file:
        recording_file.write(base64.b64decode(payload_b64))
    return file_path

# Record -> translate -> play, repeated until the user answers anything other
# than "y" at the prompt.
keep_recording = True
while keep_recording:
    file_path = record_audio()

    source_language = input("Enter source language (en/hi/ta): ").lower()
    target_language = input("Enter target language (en/hi/ta): ").lower()

    try:
        audio = speech_to_speech_pipeline(
            source_language,
            target_language,
            asr_models,
            asr_processors,
            mt_models,
            mt_tokenizers,
            tts_models,
            tts_tokenizers,
            tts_desc_tokenizers,
            live_audio=False,
            audio_file_path=file_path,
        )
        display(audio)
    except Exception as err:
        # Report the failure and still offer another attempt.
        print(f"Error: {err}")

    continue_recording = input("Do you want to record again? (y/n): ").lower()
    keep_recording = continue_recording == 'y'
    if not keep_recording:
        print("Exiting recording loop.")

<IPython.core.display.Javascript object>

Please use the UI buttons above to record.
Recording complete.
Enter source language (en/hi/ta): en
Enter target language (en/hi/ta): hi
Error: name 'speech_to_speech_pipeline' is not defined
Do you want to record again? (y/n): n
Exiting recording loop.
