In [1]:
import torch
import librosa
from transformers import WhisperProcessor, WhisperForConditionalGeneration

print("Loading Whisper model and processor...")
whisper_processor = WhisperProcessor.from_pretrained('openai/whisper-base')
whisper_model = WhisperForConditionalGeneration.from_pretrained('openai/whisper-base')

from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from langdetect import detect
print("Loading mBART model and tokenizer...")
mbart_model = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-50-many-to-many-mmt')
mbart_tokenizer = MBart50TokenizerFast.from_pretrained('facebook/mbart-large-50-many-to-many-mmt')


  from .autonotebook import tqdm as notebook_tqdm


Loading Whisper model and processor...


Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 4975.45it/s]



Loading mBART model and tokenizer...


In [2]:
def whisper_transcribe(audio_path):
    print(f'Whisper: Transcribing audio file: {audio_path}')
    audio, sr = librosa.load(audio_path, sr=16000)
    audio_input = whisper_processor(audio, return_tensors='pt', sampling_rate=sr)
    generated_ids = whisper_model.generate(
        **audio_input,
        task='transcribe'
    )
    transcription = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)
    print('Whisper: Transcription completed.')
    return transcription[0]

def whisper_translate(audio_path):
    print(f'Whisper: Translating audio file to English: {audio_path}')
    audio, sr = librosa.load(audio_path, sr=16000)
    audio_input = whisper_processor(audio, return_tensors='pt', sampling_rate=sr)
    generated_ids = whisper_model.generate(
        **audio_input,
        task='translate',
    )
    translation = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)
    print('Whisper: Translation to English completed.')
    return translation[0]


def mbart_translate_to_english(text):
    print(f'mBART: Translating to English: {text}')
    lang_code = detect(text)
    print(f'mBART: Detected language code: {lang_code}')
    # Map langdetect code to mBART50 code
    lang_map = {
        'ta': 'ta_IN',
        'ml': 'ml_IN',
        'hi': 'hi_IN',
        'bn': 'bn_IN',
        'gu': 'gu_IN',
        'kn': 'kn_IN',
        'mr': 'mr_IN',
        'pa': 'pa_IN',
        'te': 'te_IN',
        'en': 'en_XX'
    }
    src_lang = lang_map.get(lang_code, 'en_XX')
    mbart_tokenizer.src_lang = src_lang
    encoded = mbart_tokenizer(text, return_tensors='pt')
    generated_tokens = mbart_model.generate(**encoded, forced_bos_token_id=mbart_tokenizer.lang_code_to_id['en_XX'])
    translated_text = mbart_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    print('mBART: Translation completed.')
    return translated_text

from transformers import pipeline
print('Loading Sarvam Translate pipeline...')
sarvam_translate = pipeline('translation', model='sarvamai/sarvam-translate')
def sarvam_translate_to_english(text):
    print(f'Sarvam Translate: Translating to English: {text}')
    result = sarvam_translate(text, src_lang='auto', tgt_lang='en')
    translated_text = result[0]['translation_text']
    print('Sarvam Translate: Translation completed.')
    return translated_text

Loading Sarvam Translate pipeline...


Loading checkpoint shards: 100%|██████████| 2/2 [00:44<00:00, 22.11s/it]

Device set to use mps:0
Device set to use mps:0
The model 'Gemma3ForConditionalGeneration' is not supported for translation. Supported models are ['PeftModelForSeq2SeqLM', 'BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'GraniteSpeechForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'Qwen2AudioForConditionalGeneration', 'SeamlessM4TForTextToText', 'Seaml

In [3]:
audio_file_path = '/Users/amartyanambiar/Projects/AI Salesman/WhatsApp Audio 2025-09-02 at 12.08.22.opus'
transcription = whisper_transcribe(audio_file_path)

whisper_translation = whisper_translate(audio_file_path)

mbart_translation = mbart_translate_to_english(transcription)

sarvam_translate = sarvam_translate_to_english(transcription)

Whisper: Transcribing audio file: /Users/amartyanambiar/Projects/AI Salesman/WhatsApp Audio 2025-09-02 at 12.08.22.opus


Python(11245) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Whisper: Transcription completed.
Whisper: Translating audio file to English: /Users/amartyanambiar/Projects/AI Salesman/WhatsApp Audio 2025-09-02 at 12.08.22.opus
Whisper: Translation to English completed.
mBART: Translating to English:  கண்ணா பண்ணிங்கிதான் கூட்டும் கூட்டும் மாறு, சிங்கும் சிங்கிலாதான் வரு.
Whisper: Translation to English completed.
mBART: Translating to English:  கண்ணா பண்ணிங்கிதான் கூட்டும் கூட்டும் மாறு, சிங்கும் சிங்கிலாதான் வரு.
mBART: Detected language code: ta
mBART: Detected language code: ta


Your input_length: 21 is bigger than 0.9 * max_length: 20. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


mBART: Translation completed.
Sarvam Translate: Translating to English:  கண்ணா பண்ணிங்கிதான் கூட்டும் கூட்டும் மாறு, சிங்கும் சிங்கிலாதான் வரு.


KeyboardInterrupt: 

In [None]:
print('Original transcription:', transcription)
print('Whisper translation:', whisper_translation)
print('mBART translation:', mbart_translation)
print('Sarvam translation:', sarvam_translate)

'And the lion and the dragon shall come, and the lion and the dragon shall come.'

In [None]:
# from transformers import T5ForConditionalGeneration, AutoTokenizer
# t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')
# t5_tokenizer = AutoTokenizer.from_pretrained('t5-small')
# def t5_translate_to_english(text):
#     print(f'T5: Translating to English: {text}')
#     prompt = f'translate to English: {text}'
#     input_ids = t5_tokenizer.encode(prompt, return_tensors='pt')
#     translated_ids = t5_model.generate(input_ids)
#     translated_text = t5_tokenizer.decode(translated_ids[0], skip_special_tokens=True)
#     print('T5: Translation completed.')
#     return translated_text
# # Example usage:
# t5_translation = t5_translate_to_english(transcription)
# print('T5 English translation:', t5_translation)