<a href="https://colab.research.google.com/github/soumenhalder/audio_translator/blob/main/Audio_IO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Description

This notebook explores live audio input (recording from the browser microphone) and spoken audio output (text-to-speech).

## Audio I/O (library install + define functions)


### Install libraries

In [None]:
# Audio I/O helpers. soundfile is used by record_audio_colab to write the 16 kHz WAV.
# NOTE(review): sounddevice is not referenced by any cell visible in this notebook - confirm it is still needed.
!pip install sounddevice
!pip install soundfile
# Text-to-speech backend used by speak_text.
!pip install gTTS ## google tts
# Speech recognition (OpenAI Whisper), installed straight from the GitHub repository.
!pip install -q git+https://github.com/openai/whisper.git # whisper
# Torch with CUDA (Colab usually has this preinstalled, but update if needed)
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# Transformers for NLLB translation
!pip install -U transformers
# SentencePiece and other tokenizer dependencies
!pip install sentencepiece sacremoses
# NOTE(review): presumably required by the external voicefilter.py script that translatorApp runs - confirm.
!pip install ailia

Collecting sounddevice
  Downloading sounddevice-0.5.2-py3-none-any.whl.metadata (1.6 kB)
Downloading sounddevice-0.5.2-py3-none-any.whl (32 kB)
Installing collected packages: sounddevice
Successfully installed sounddevice-0.5.2
Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting click<8.2,>=7.1 (from gTTS)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: click, gTTS
  Attempting uninstall: click
    Found existing installation: click 8.2.1
    Uninstalling click-8.2.1:
      Successfully uninstalled click-8.2.1
Successfully installed click-8.1.8 gTTS-2.5.4
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyp

### Audio input

In [None]:
def record_audio_colab(duration_sec=5, resample_to_16k=True, output_dir='Output'):
    """Record microphone audio through the browser in Google Colab and save it.

    The browser's ``MediaRecorder`` captures ``duration_sec`` seconds; the
    result comes back to Python as a base64 data URL and is written verbatim
    to ``<output_dir>/recorded.wav``.

    NOTE(review): the bytes are whatever container the browser produced
    (presumably WebM/Ogg rather than RIFF/WAV - confirm), so the file is always
    decoded with ``librosa.load`` rather than read directly with ``soundfile``.

    Args:
        duration_sec (int | float): Recording length in seconds; must be > 0.
        resample_to_16k (bool): When True, also write a 16 kHz copy to
            ``<output_dir>/recorded_16k.wav`` and return that file.
        output_dir (str): Directory for the saved files (created if missing).

    Returns:
        tuple[str, int]: Path of the audio file to use downstream and its
        sample rate in Hz.

    Raises:
        ValueError: If ``duration_sec`` is not positive.
        RuntimeError: If the browser returned no usable audio data.
    """
    from IPython.display import Audio, display
    from google.colab import output
    import base64
    import os
    import warnings
    import soundfile as sf
    import librosa

    if duration_sec <= 0:
        raise ValueError("duration_sec must be a positive number of seconds")

    os.makedirs(output_dir, exist_ok=True)
    raw_path = f"{output_dir}/recorded.wav"
    resampled_path = f"{output_dir}/recorded_16k.wav"
    duration_ms = int(duration_sec * 1000)

    # JavaScript for browser recording
    RECORD = f"""
    const sleep = time => new Promise(resolve => setTimeout(resolve, time))
    const b2text = blob => new Promise(resolve => {{
      const reader = new FileReader()
      reader.onloadend = e => resolve(reader.result)
      reader.readAsDataURL(blob)
    }})

    var record = async function(){{
      const stream = await navigator.mediaDevices.getUserMedia({{ audio: true }})
      const recorder = new MediaRecorder(stream)
      const chunks = []
      recorder.ondataavailable = e => chunks.push(e.data)
      recorder.start()
      await sleep({duration_ms})  // Record for given duration
      recorder.stop()
      await new Promise(resolve => recorder.onstop = resolve)
      const blob = new Blob(chunks)
      const b64 = await b2text(blob)
      stream.getTracks().forEach(track => track.stop())
      return b64
    }}
    """

    # Start recording
    print("Recording... Speak now.")
    output.eval_js(RECORD)
    recorded_audio = output.eval_js("record()")
    print("Done recording!")

    # The data URL looks like "data:<mime>;base64,<payload>"; fail loudly if
    # the payload is missing instead of raising an opaque IndexError later.
    if not recorded_audio or ',' not in recorded_audio:
        raise RuntimeError("Browser returned no audio data; check microphone permissions.")

    # Decode base64 audio and save
    audio_bytes = base64.b64decode(recorded_audio.split(',', 1)[1])
    with open(raw_path, "wb") as f:
        f.write(audio_bytes)

    print("Saved recorded.wav")

    # Decode once. sr=None keeps the native rate when no resampling is wanted.
    # Warnings are silenced only inside this context so the caller's global
    # warning filters are left untouched.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        samples, sample_rate = librosa.load(raw_path, sr=16000 if resample_to_16k else None)

    if resample_to_16k:
        sf.write(resampled_path, samples, 16000)
        print("Saved recorded_16k.wav (16 kHz)")
        display(Audio(resampled_path))
        return resampled_path, 16000

    display(Audio(raw_path))
    return raw_path, int(sample_rate)


In [None]:
#audio_i_file, _ = record_audio_colab(duration_sec=5, resample_to_16k=True)

### Audio output

In [None]:
from gtts import gTTS
from IPython.display import Audio

def speak_text(text, lang='en', filename='speech.mp3'):
    """
    Convert text to speech with gTTS and save it as an audio file.

    The function only writes the file; wrap the returned path in
    ``IPython.display.Audio`` to get a player in Colab.

    Args:
        text (str): The text to convert to speech.
        lang (str): Language code (default 'en').
        filename (str): Output audio file name (default 'speech.mp3').

    Returns:
        str: ``filename``, the path of the saved audio file.

    Raises:
        ValueError: If ``text`` is empty or contains only whitespace.
    """
    import os

    # Reject empty input up front with a clear message rather than letting
    # the TTS backend fail on it.
    if not text or not text.strip():
        raise ValueError("speak_text() needs non-empty text to synthesize.")

    # Make sure the destination directory exists when a nested path is given.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    tts = gTTS(text=text, lang=lang)
    tts.save(filename)
    return filename

# Example usage:
#Audio(speak_text("Hello, I am a robot speaking from Colab!"))
#Audio(speak_text("Bonjour, comment ça va ?", lang='fr'))


### Transcription and translation

#### Translator with Helsinki-NLP (OPUS-MT)

In [None]:
from google.colab import userdata
import os

# Load Hugging Face token from Colab secrets
HF_TOKEN = userdata.get('HF_TOKEN')

# Validate BEFORE exporting, so a malformed token never reaches the
# environment. An explicit raise is used because `assert` can be disabled.
if not (isinstance(HF_TOKEN, str) and HF_TOKEN.startswith('hf_')):
    raise ValueError("Invalid Hugging Face token! Check your Colab secrets.")

os.environ['HF_TOKEN'] = HF_TOKEN
import os
import whisper
from transformers import pipeline
from typing import Optional, Union
import torch

import glob
import numpy as np
import warnings
import pandas as pd
import tqdm

class SpeechTranslator_OLD:
    """Whisper transcription with optional Helsinki-NLP (OPUS-MT) translation.

    Translation pipelines are loaded lazily and cached in
    ``self.translation_models`` under ``"<source>-<target>"`` keys.
    """

    def __init__(self, model_size: str = "medium"):
        """Load the Whisper model on GPU when available, otherwise on CPU."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = whisper.load_model(model_size, device=self.device)
        self.translation_models = {}  # Cache for non-English translators

    def _transcribe(self, audio_path: str, language: Optional[str], task: Optional[str] = None) -> dict:
        """Run Whisper on ``audio_path`` and return its raw result dict."""
        options = {"language": language, "fp16": (self.device == "cuda")}
        if task is not None:
            options["task"] = task
        return self.model.transcribe(audio_path, **options)

    def process_audio(
        self,
        audio_path: str,
        translate: bool = False,
        target_lang: str = "en",  # ISO-639-1 code (default: English)
        source_lang: Optional[str] = None  # Auto-detect if None
    ) -> str:
        """
        Transcribe ``audio_path`` and optionally translate the text.

        Smart routing:
        - If target==English: Use Whisper's native translation
        - Else: Transcribe first, then translate with external model

        Args:
            audio_path: Path of the audio file to process.
            translate: When False, only the transcription is returned.
            target_lang: ISO-639-1 code of the desired output language.
            source_lang: ISO-639-1 code of the spoken language. When None,
                Whisper auto-detects it and the detected code is used for the
                external translation step.

        Returns:
            The transcribed or translated text.
        """
        # Case 1: Use Whisper's fast native English translation
        if translate and target_lang == "en":
            return self._transcribe(audio_path, source_lang, task="translate")["text"]

        result = self._transcribe(audio_path, source_lang)

        # Case 2: Transcribe only (no translation)
        if not translate:
            return result["text"]

        # Case 3: Non-English translation (transcribe -> external translate).
        # Fall back to the language Whisper detected, so auto-detection does
        # not produce a bogus "None-<target>" model key.
        detected_lang = source_lang or result.get("language")
        return self._translate_text(
            text=result["text"],
            source_lang=detected_lang,
            target_lang=target_lang
        )

    def _load_pipeline(self, model_key: str, verify: bool = False) -> None:
        """Load the OPUS-MT pipeline for ``model_key`` into the cache.

        With ``verify=True`` the Hub is queried first, so a missing model
        fails before any download is attempted.
        """
        from google.colab import userdata

        repo_id = f"Helsinki-NLP/opus-mt-{model_key}"
        if verify:
            from huggingface_hub import model_info
            model_info(repo_id)
        self.translation_models[model_key] = pipeline(
            "translation",
            model=repo_id,
            device=self.device,
            token=userdata.get('HF_TOKEN')
        )

    def _translate_text(
        self,
        text: str,
        source_lang: str,
        target_lang: str
    ) -> str:
        """Translate ``text`` with lazily loaded OPUS-MT models.

        A direct ``source->target`` model is preferred. If none can be loaded,
        the text is routed through English (``source->en`` then
        ``en->target``). The route is derived from what is already cached, so
        repeated calls reuse loaded pipelines instead of re-resolving them.

        Returns:
            The translated text, or the original ``text`` when translation is
            unnecessary (same language) or fails (best-effort behaviour).
        """
        if source_lang and source_lang == target_lang:
            return text

        cache = self.translation_models
        direct_key = f"{source_lang}-{target_lang}"
        to_english_key = f"{source_lang}-en"
        from_english_key = f"en-{target_lang}"

        try:
            if not source_lang:
                raise ValueError("source language is unknown")

            pivot_cached = to_english_key in cache and from_english_key in cache
            if direct_key not in cache and not pivot_cached:
                try:
                    # First try direct translation
                    self._load_pipeline(direct_key, verify=True)
                except Exception:
                    print(f"Direct translation not available for {direct_key}, falling back to English intermediate")
                    # Load only the legs that are not cached yet.
                    for leg_key in (to_english_key, from_english_key):
                        if leg_key not in cache:
                            self._load_pipeline(leg_key)

            if direct_key in cache:
                return cache[direct_key](text)[0]["translation_text"]

            # Two-step translation via English
            english = cache[to_english_key](text)[0]["translation_text"]
            return cache[from_english_key](english)[0]["translation_text"]

        except Exception as e:
            print(f"Translation failed completely: {e}")
            return text  # Return original text if all fails



#### Translator with Meta’s NLLB-200

In [1]:

from google.colab import userdata
import os

# Load Hugging Face token from Colab secrets
HF_TOKEN = userdata.get('HF_TOKEN')

# Validate BEFORE exporting, so a malformed token never reaches the
# environment. An explicit raise is used because `assert` can be disabled.
if not (isinstance(HF_TOKEN, str) and HF_TOKEN.startswith('hf_')):
    raise ValueError("Invalid Hugging Face token! Check your Colab secrets.")

os.environ['HF_TOKEN'] = HF_TOKEN
import os
from google.colab import userdata
import os
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from typing import Optional
import torch

class SpeechTranslator:
    """Whisper transcription with optional translation through Meta's NLLB-200.

    The NLLB tokenizer and model are loaded exactly once per instance and are
    reachable as ``nllb_tokenizer`` / ``nllb_model``. ``tokenizer`` and
    ``translator`` are kept as aliases of the same objects.
    """

    NLLB_MODEL_ID = "facebook/nllb-200-distilled-600M"

    # ISO639-1 -> NLLB language codes
    LANG_MAP = {
        'en': 'eng_Latn',
        'hi': 'hin_Deva',
        'bn': 'ben_Beng',
        'ta': 'tam_Taml',
        'te': 'tel_Telu',
        'ml': 'mal_Mlym',
        'gu': 'guj_Gujr',
        'mr': 'mar_Deva',
        'kn': 'kan_Knda',
        'ur': 'urd_Arab',
        'fr': 'fra_Latn',
        'de': 'deu_Latn',
        'es': 'spa_Latn',
        'zh': 'zho_Hans',
        'ar': 'arb_Arab',
        'ru': 'rus_Cyrl',
        'ja': 'jpn_Jpan',
        'ko': 'kor_Hang'
    }

    def __init__(self, model_size: str = "medium"):
        """Load Whisper and the NLLB model on GPU when available, else CPU."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = whisper.load_model(model_size, device=self.device)

        # Load NLLB model only once; the aliases below point at the same
        # objects, so no second copy of the weights is created.
        self._ensure_nllb_loaded()
        self.tokenizer = self.nllb_tokenizer
        self.translator = self.nllb_model

    def _ensure_nllb_loaded(self) -> None:
        """Load the NLLB tokenizer and model unless they are already present."""
        if hasattr(self, 'nllb_tokenizer') and hasattr(self, 'nllb_model'):
            return
        hf_token = userdata.get("HF_TOKEN")
        # `token=` is used in place of the deprecated `use_auth_token=`.
        self.nllb_tokenizer = AutoTokenizer.from_pretrained(
            self.NLLB_MODEL_ID, use_fast=False, token=hf_token
        )
        self.nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
            self.NLLB_MODEL_ID, token=hf_token
        ).to(self.device)

    def _transcribe(self, audio_path: str, language: Optional[str], task: Optional[str] = None) -> dict:
        """Run Whisper on ``audio_path`` and return its raw result dict."""
        options = {"language": language, "fp16": (self.device == "cuda")}
        if task is not None:
            options["task"] = task
        return self.model.transcribe(audio_path, **options)

    def process_audio(
        self,
        audio_path: str,
        translate: bool = False,
        target_lang: str = "en",  # ISO-639-1
        source_lang: Optional[str] = None
    ) -> str:
        """
        Transcribe ``audio_path`` and optionally translate the text.

        Smart routing:
        - If target == English: Use Whisper's built-in translation
        - Else: transcribe -> translate with NLLB

        Args:
            audio_path: Path of the audio file to process.
            translate: When False, only the transcription is returned.
            target_lang: ISO-639-1 code of the desired output language.
            source_lang: ISO-639-1 code of the spoken language. When None,
                Whisper auto-detects it and the detected code is passed on to
                the NLLB translation step.

        Returns:
            The transcribed or translated text.

        Raises:
            ValueError: If NLLB translation is requested for a language code
                that is missing from ``LANG_MAP``.
        """
        if translate and target_lang == "en":
            return self._transcribe(audio_path, source_lang, task="translate")["text"]

        result = self._transcribe(audio_path, source_lang)
        if not translate:
            return result["text"]

        # Use the language Whisper detected when the caller asked for
        # auto-detection; passing None on would always be rejected below.
        detected_lang = source_lang or result.get("language")
        return self._translate_text(result["text"], detected_lang, target_lang)

    def _translate_text(
        self,
        text: str,
        source_lang: str,
        target_lang: str
    ) -> str:
        """
        Translate using Meta's NLLB-200 model.
        Language codes follow ISO-639-1 -> NLLB language tags (``LANG_MAP``).

        Returns:
            The translated text. ``text`` is returned unchanged when it is
            empty or whitespace only, or when source and target are equal.

        Raises:
            ValueError: If either language code is not in ``LANG_MAP``.
        """
        # Language code validation
        try:
            src_code = self.LANG_MAP[source_lang]
            tgt_code = self.LANG_MAP[target_lang]
        except KeyError:
            raise ValueError(f"Unsupported language code: {source_lang} or {target_lang}")

        # Nothing to translate: skip the model call entirely.
        if not text or not text.strip() or src_code == tgt_code:
            return text

        self._ensure_nllb_loaded()

        # Set source language
        self.nllb_tokenizer.src_lang = src_code

        # Tokenize input
        inputs = self.nllb_tokenizer(text, return_tensors="pt").to(self.device)

        # The target language tag is forced as the first generated token.
        tgt_lang_id = self.nllb_tokenizer.convert_tokens_to_ids(tgt_code)

        # Generate translation
        generated = self.nllb_model.generate(
            **inputs,
            forced_bos_token_id=tgt_lang_id,
            max_length=512
        )

        # Decode output
        return self.nllb_tokenizer.batch_decode(generated, skip_special_tokens=True)[0]


ModuleNotFoundError: No module named 'whisper'

### Full flow in a single function

In [None]:
from typing import Optional
from IPython.display import Audio
def audio_io(
    input_audio: Optional[str] = None,
    translate: bool = False,
    target_lang: str = 'en',
    source_lang: Optional[str] = None,
    duration_sec: int = 5,
    speak_lang: Optional[str] = 'en',
    whisper_model_size: str = "medium"
) -> "Audio":
    """
    Records (or loads) audio, transcribes/translates it, and speaks the result.

    Parameters:
        input_audio (Optional[str]): Path of an existing audio file. When
            empty or None, audio is recorded from the microphone instead.
        translate (bool): Whether to translate the audio input.
        target_lang (str): Target language (ISO-639-1 code).
        source_lang (Optional[str]): Source language; if None, auto-detect.
        duration_sec (int): Duration of audio recording in seconds (only used
            when recording).
        speak_lang (Optional[str]): Language used for speech output; None
            means "use target_lang".
        whisper_model_size (str): Whisper model name passed to SpeechTranslator.

    Returns:
        IPython.display.Audio: Player for the synthesized speech
        (``speech.mp3``). The recognised text is printed, not returned.
    """
    if not input_audio:
        # Step 1: Record audio
        audio_i_file, _ = record_audio_colab(duration_sec=duration_sec, resample_to_16k=True)
    else:
        audio_i_file = input_audio

    # Step 2: Transcribe/translate audio
    print("Transcription procedure is on...")
    translator = SpeechTranslator(model_size=whisper_model_size)
    text = translator.process_audio(
        audio_path=audio_i_file,
        translate=translate,
        target_lang=target_lang,
        source_lang=source_lang
    )
    print("The transcription/translation:", text)

    # Step 3: Speak the result
    voice_lang = target_lang if speak_lang is None else speak_lang
    return Audio(speak_text(text, lang=voice_lang, filename='speech.mp3'))

### Audio input, source extraction, transcription and spoken output

In [None]:
from typing import Optional
from IPython.display import Audio
import subprocess
from google.colab import drive
drive.mount('/content/drive')

def translatorApp(
    extract_source:  bool = True,
    reference_audio : str = '',
    input_audio: Optional[str] = None,
    translate: bool = False,
    target_lang: str = 'en',
    source_lang: Optional[str] = None,
    duration_sec: int = 5,
    speak_lang: Optional[str] = None,
    whisper_model_size: str = "medium",
    output_dir: str='Output',
    voicefilter_dir: str = '/content/drive/MyDrive/ColabData/voice_filter_ailia/'
) -> "Audio":
    """
    Records (or loads) audio, optionally isolates one speaker, then
    transcribes/translates the audio and speaks the result.

    Parameters:
        extract_source (bool): Run the external voice-filter script to isolate
            the speaker heard in ``reference_audio`` before transcription.
        reference_audio (str): Path of a clip of the speaker to isolate.
            Required when ``extract_source`` is True.
        input_audio (Optional[str]): Path of an existing audio file. When
            empty or None, audio is recorded from the microphone instead.
        translate (bool): Whether to translate the audio input.
        target_lang (str): Target language (ISO-639-1 code).
        source_lang (Optional[str]): Source language; if None, auto-detect.
        duration_sec (int): Duration of audio recording in seconds (only used
            when recording).
        speak_lang (Optional[str]): Language used for speech output; None
            means "use target_lang".
        whisper_model_size (str): Whisper model name passed to SpeechTranslator.
        output_dir (str): Directory for recordings, extracted audio and speech.
        voicefilter_dir (str): Directory that contains ``voicefilter.py``.

    Returns:
        IPython.display.Audio: Player for ``<output_dir>/speech.mp3``. The
        recognised text is printed, not returned.

    Raises:
        ValueError: If extraction is requested without an existing
            ``reference_audio`` file.
        subprocess.CalledProcessError: If the voice-filter script fails.
        FileNotFoundError: If the voice-filter output cannot be located.
    """
    import os
    import sys

    # Validate up front so the user is not asked to record audio that would
    # be thrown away by an argument error afterwards.
    if extract_source:
        if reference_audio == '':
            raise ValueError('reference audio can not be empty for source audio extraction')
        if not os.path.exists(reference_audio):
            raise ValueError('Reference audio does not exist')

    os.makedirs(output_dir, exist_ok=True)
    if not input_audio:
        # Step 1: Record audio
        audio_i_file, _ = record_audio_colab(duration_sec=duration_sec, resample_to_16k=True, output_dir=output_dir)
    else:
        audio_i_file = input_audio

    audio_for_asr = audio_i_file
    if extract_source:
        command = [
            sys.executable,  # same interpreter/environment as the notebook kernel
            os.path.join(voicefilter_dir, "voicefilter.py"),
            "--input", audio_i_file,
            "--reference_file", reference_audio,
            "--savepath", output_dir + "/"
        ]
        # check=True: stop here if extraction failed, not later in Whisper.
        subprocess.run(command, check=True)

        # NOTE(review): voicefilter.py is not visible here. It is presumed to
        # write "<savepath>/<input stem>_res<ext>" - confirm. The path next to
        # the input file is still probed as a fallback.
        root, ext = os.path.splitext(audio_i_file)
        candidates = [
            os.path.join(output_dir, f"{os.path.basename(root)}_res{ext}"),
            f"{root}_res{ext}",
        ]
        audio_for_asr = next((path for path in candidates if os.path.exists(path)), None)
        if audio_for_asr is None:
            raise FileNotFoundError(
                "Voice-filter output not found; looked for: " + ", ".join(candidates)
            )

    # Step 2: Transcribe/translate audio
    print("Transcription procedure is on...")
    translator = SpeechTranslator(model_size=whisper_model_size)
    text = translator.process_audio(
        audio_path=audio_for_asr,
        translate=translate,
        target_lang=target_lang,
        source_lang=source_lang
    )
    print("The transcription/translation:", text)

    # Step 3: Speak the result
    voice_lang = target_lang if speak_lang is None else speak_lang
    return Audio(speak_text(text, lang=voice_lang, filename=f'{output_dir}/speech.mp3'))

Mounted at /content/drive


## Testing the flow

### Audio I/O no extraction

In [None]:
# No input file is given, so 20 s are recorded from the microphone; the audio
# is transcribed (no translation) with the Whisper 'turbo' model and the text
# is spoken back with the default English voice.
audio_io(whisper_model_size='turbo', duration_sec=20, translate=False, source_lang='en', target_lang='en')

### From stored audio

In [None]:
# Stored-audio run: extract the reference speaker from a mixed recording on
# Drive, transcribe the extracted audio (no translation) and speak it in English.
# duration_sec has no effect here because input_audio is provided, so the
# recording step is skipped.
reference_audio = '/content/drive/MyDrive/ColabData/50_speakers_audio_data/Speaker0050/Speaker0050_000.wav'
mixed_audio =  '/content/drive/MyDrive/ColabData/AudioMix/Cocktail_5+15+4/Speaker0050_025_mix.wav'
translatorApp(
    extract_source = True,
    reference_audio  = reference_audio,
    input_audio = mixed_audio,
    translate  = False,
    target_lang = 'en',
    source_lang = 'en',
    duration_sec = 10,
    speak_lang  = 'en',
    whisper_model_size  = "medium",
    output_dir ='OutputDir'
)

In [None]:
# Play the mixed input recording for comparison.
Audio(mixed_audio)

In [None]:
# Play the reference clip passed to the extraction step.
Audio(reference_audio)

In [None]:
# Same stored recording as above, now translated from English to Bengali.
# speak_lang=None makes translatorApp speak the result in target_lang ('bn').
# duration_sec has no effect here because input_audio is provided.
reference_audio = '/content/drive/MyDrive/ColabData/50_speakers_audio_data/Speaker0050/Speaker0050_000.wav'
mixed_audio =  '/content/drive/MyDrive/ColabData/AudioMix/Cocktail_5+15+4/Speaker0050_025_mix.wav'
translatorApp(
    extract_source = True,
    reference_audio  = reference_audio,
    input_audio = mixed_audio,
    translate  = True,
    target_lang = 'bn',
    source_lang = 'en',
    duration_sec = 10,
    speak_lang  = None,
    whisper_model_size  = "medium",
    output_dir ='OutputDir'
)

### Testing nllb

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Load Meta NLLB-200 distilled model
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Language codes (NLLB / FLORES-200 tags: three-letter language + script).
# Bengali is "ben_Beng"; the ISO-639-1 style "bn_Beng" is not an NLLB tag.
src_lang = "ben_Beng"    # Bengali
tgt_lang = "eng_Latn"    # English

# Set source language
tokenizer.src_lang = src_lang

# Sample Bengali text
text = "আপনি কেমন আছেন?"

# Create translation pipeline
translator = pipeline("translation", model=model, tokenizer=tokenizer)

# Run translation
translated = translator(text, src_lang=src_lang, tgt_lang=tgt_lang)[0]['translation_text']

print("Original:", text)
print("Translated:", translated)


### Live recording

In [None]:
from IPython.display import display

ref_soumen = '/content/drive/MyDrive/ColabData/MySpeakerCollection/Soumen/audio_2025-06-17_16-38-28.ogg'
# Only the last expression of a cell is rendered automatically, so the
# reference clip has to be displayed explicitly to get a player.
display(Audio(ref_soumen))

# Live run: record 30 s, isolate the reference speaker, translate English to
# French with Whisper 'medium' and speak the result in the target language.
translatorApp(
    extract_source = True,
    reference_audio  = ref_soumen,
    input_audio = None,
    translate  = True,
    target_lang = 'fr',
    source_lang = 'en',
    duration_sec = 30,
    speak_lang  = None,
    whisper_model_size  = "medium",
    output_dir ='OutputDir'
)

Recording... Speak now.
Done recording!
Saved recorded.wav
Saved recorded_16k.wav (16 kHz)


Transcription procedure is on...
The transcription/translation: Nous avons une situation assez bonne ici et je pense que nous pouvons attaquer le territoire ennemi et puisque nous sommes bien équipés dans nos structures, nous sommes dans une très bonne situation.


In [None]:
from IPython.display import display

ref_soumen = '/content/drive/MyDrive/ColabData/MySpeakerCollection/Soumen/audio_2025-06-17_16-38-28.ogg'
# Only the last expression of a cell is rendered automatically, so the
# reference clip has to be displayed explicitly to get a player.
display(Audio(ref_soumen))

# Live run: record 30 s, isolate the reference speaker, translate Hindi to
# French with Whisper 'turbo' and speak the result in the target language.
translatorApp(
    extract_source = True,
    reference_audio  = ref_soumen,
    input_audio = None,
    translate  = True,
    target_lang = 'fr',
    source_lang = 'hi',
    duration_sec = 30,
    speak_lang  = None,
    whisper_model_size  = "turbo",
    output_dir ='OutputDir'
)

Recording... Speak now.
Done recording!
Saved recorded.wav
Saved recorded_16k.wav (16 kHz)


Transcription procedure is on...
The transcription/translation: Le premier ministre Narendra Modi n'a pas tenu le sommet en tant que géstion.


In [None]:
from IPython.display import display

ref_soumen = '/content/drive/MyDrive/ColabData/MySpeakerCollection/Soumen/audio_2025-06-17_16-38-28.ogg'
# Only the last expression of a cell is rendered automatically, so the
# reference clip has to be displayed explicitly to get a player.
display(Audio(ref_soumen))

# Live run: record 30 s, isolate the reference speaker, translate English to
# French with Whisper 'turbo' and speak the result in the target language.
translatorApp(
    extract_source = True,
    reference_audio  = ref_soumen,
    input_audio = None,
    translate  = True,
    target_lang = 'fr',
    source_lang = 'en',
    duration_sec = 30,
    speak_lang  = None,
    whisper_model_size  = "turbo",
    output_dir ='OutputDir'
)

Recording... Speak now.
Done recording!
Saved recorded.wav
Saved recorded_16k.wav (16 kHz)


Transcription procedure is on...
The transcription/translation: Blackstone a acquis South City Mall à Kolkata pour ses 3 250 000 dollars par la plupart des capitales du Bengale dans le plus grand marché immobilier à ce jour.


In [None]:
from IPython.display import display

ref_soumen = '/content/drive/MyDrive/ColabData/MySpeakerCollection/Soumen/audio_2025-06-17_16-38-28.ogg'
# Only the last expression of a cell is rendered automatically, so the
# reference clip has to be displayed explicitly to get a player.
display(Audio(ref_soumen))

# Live run: record 15 s, isolate the reference speaker, translate English to
# French with Whisper 'turbo' and speak the result in the target language.
translatorApp(
    extract_source = True,
    reference_audio  = ref_soumen,
    input_audio = None,
    translate  = True,
    target_lang = 'fr',
    source_lang = 'en',
    duration_sec = 15,
    speak_lang  = None,
    whisper_model_size  = "turbo",
    output_dir ='OutputDir'
)

Recording... Speak now.
Done recording!
Saved recorded.wav
Saved recorded_16k.wav (16 kHz)


Transcription procedure is on...
The transcription/translation: La réglementation de l'IA en Inde est un modèle et un dirigeant de Google.


In [None]:
# Play the "_res" file that translatorApp passes to Whisper after extraction
# (the 16 kHz live recording's path with "_res" inserted before the extension).
Audio("OutputDir/recorded_16k_res.wav")