<a href="https://colab.research.google.com/github/santhosh220z/ml-projects/blob/main/speech_to_text_translator_using_hugging_face_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
 pip install transformers librosa torch



In [None]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM
import librosa
import torch

def preprocess_audio(audio_path):
    """Load an audio file and return it as a tensor with its sample rate.

    Args:
        audio_path: Location of the audio file; forwarded unchanged to
            ``librosa.load``.

    Returns:
        A ``(waveform, sample_rate)`` tuple, where ``waveform`` is the loaded
        sample data wrapped in a ``torch.Tensor`` and ``sample_rate`` is the
        rate returned by librosa.
    """
    # Request 16 kHz audio from librosa regardless of the file's native rate.
    target_rate = 16000
    samples, rate = librosa.load(audio_path, sr=target_rate)
    waveform = torch.tensor(samples)
    return waveform, rate

def speech_to_text(audio_path):
    """Transcribe an audio file using the ``openai/whisper-base`` checkpoint.

    The processor and model are loaded on every call, so the first call may
    download the checkpoint files.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The first decoded transcription string produced for the audio.
    """
    print("Converting speech to text...")
    checkpoint = "openai/whisper-base"
    whisper_processor = AutoProcessor.from_pretrained(checkpoint)
    whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(checkpoint)

    waveform, rate = preprocess_audio(audio_path)
    features = whisper_processor(
        waveform,
        sampling_rate=rate,
        return_tensors="pt",
    )

    # Inference only: gradient tracking is switched off while generating.
    with torch.no_grad():
        token_ids = whisper_model.generate(features["input_features"])

    decoded = whisper_processor.batch_decode(
        token_ids,
        skip_special_tokens=True,
    )
    return decoded[0]

def translate_text_hindi(text, target_language="hindi"):
    """Translate text with the ``Helsinki-NLP/opus-mt-en-hi`` checkpoint.

    Args:
        text: The text to translate.
        target_language: Label used only in the progress message; it does not
            select the model.

    Returns:
        The decoded translation of the first generated sequence.
    """
    print("Translating text to", target_language, "...")
    checkpoint = "Helsinki-NLP/opus-mt-en-hi"
    mt_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    mt_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    token_ids = mt_tokenizer.encode(text, return_tensors="pt", padding=True)
    generated = mt_model.generate(token_ids)
    # Only the first generated sequence is decoded and returned.
    return mt_tokenizer.decode(generated[0], skip_special_tokens=True)

def translate_text_french(text, target_language="french"):
    """Translate text with the ``Helsinki-NLP/opus-mt-en-fr`` checkpoint.

    Args:
        text: The text to translate.
        target_language: Label used only in the progress message; it does not
            select the model.

    Returns:
        The decoded translation of the first generated sequence.
    """
    print("Translating text to", target_language, "...")
    checkpoint = "Helsinki-NLP/opus-mt-en-fr"
    mt_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    mt_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    token_ids = mt_tokenizer.encode(text, return_tensors="pt", padding=True)
    generated = mt_model.generate(token_ids)
    # Only the first generated sequence is decoded and returned.
    return mt_tokenizer.decode(generated[0], skip_special_tokens=True)

def translate_text_spanich(text, target_language="spanich"):
    """Translate text with the ``Helsinki-NLP/opus-mt-en-es`` checkpoint.

    Args:
        text: The text to translate.
        target_language: Label used only in the progress message; it does not
            select the model.

    Returns:
        The decoded translation of the first generated sequence.
    """
    print("Translating text to", target_language, "...")
    checkpoint = "Helsinki-NLP/opus-mt-en-es"
    mt_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    mt_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    token_ids = mt_tokenizer.encode(text, return_tensors="pt", padding=True)
    generated = mt_model.generate(token_ids)
    # Only the first generated sequence is decoded and returned.
    return mt_tokenizer.decode(generated[0], skip_special_tokens=True)

def translate_text_japanese(text, target_language="japanese"):
    """Translate text with the ``Helsinki-NLP/opus-mt-en-jap`` checkpoint.

    Args:
        text: The text to translate.
        target_language: Label used only in the progress message; it does not
            select the model.

    Returns:
        The decoded translation of the first generated sequence.
    """
    print("Translating text to", target_language, "...")
    checkpoint = "Helsinki-NLP/opus-mt-en-jap"
    mt_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    mt_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    token_ids = mt_tokenizer.encode(text, return_tensors="pt", padding=True)
    generated = mt_model.generate(token_ids)
    # Only the first generated sequence is decoded and returned.
    return mt_tokenizer.decode(generated[0], skip_special_tokens=True)



if __name__ == "__main__":
    # Script entry point: transcribe a fixed audio file, ask the user for a
    # target language, translate the transcription and print both texts.
    audio_file_path = "village dialogue.wav"
    text = speech_to_text(audio_file_path)

    # Language keyword -> translator. The "japanese" entry previously called
    # an undefined translate_text_tamil and crashed with NameError.
    translators = {
        "hindi": translate_text_hindi,
        "spanich": translate_text_spanich,
        "japanese": translate_text_japanese,
    }

    # Normalise the answer so "Hindi" or " hindi " is not silently treated as
    # an unknown language.
    lang = input("Enter the language:").strip().lower()
    if lang == "spanish":
        # Accept the correct spelling as an alias for the legacy keyword.
        lang = "spanich"

    translate = translators.get(lang)
    if translate is None:
        # Any unrecognised answer falls back to French, as before.
        target_lang = "french"
        translate = translate_text_french
    else:
        target_lang = lang

    translated_text = translate(text, target_lang)
    print("\nTranscription:", text)
    print("\nTranslated text:", translated_text)

Converting speech to text...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.81k [00:00<?, ?B/s]

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Enter the language:hindi
Translating text to hindi ...


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Transcription:  I've never been out of the village before. I've never been out of the village before. But this time, I'm truly free.

Translated text: मैं पहले गांव से बाहर कभी नहीं किया गया है. मैं गांव से पहले कभी नहीं किया गया है. लेकिन इस बार, मैं वास्तव में स्वतंत्र हूँ.
