<a href="https://colab.research.google.com/github/satyam-52/speech-to-text/blob/main/speech_to_text_whisper_hindi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers torch torchaudio accelerate googletrans==4.0.0rc1

In [None]:
import torch
from transformers import pipeline
from google.colab import files
from googletrans import Translator
import IPython.display as ipd
import os

In [None]:
def initialize_models():
    """Build the Hindi Whisper speech-recognition pipeline and a Translator.

    Picks the first CUDA device when ``torch`` reports one, otherwise the CPU,
    and prints progress messages along the way.

    Returns:
        tuple: ``(transcribe, translator)`` where ``transcribe`` is the
        automatic-speech-recognition pipeline and ``translator`` is a
        ``googletrans.Translator`` instance.
    """
    if torch.cuda.is_available():
        device = "cuda:0"
    else:
        device = "cpu"
    print(f"Using device: {device}")

    # Speech-to-text pipeline; audio is processed in 30-second chunks.
    print("Loading Whisper Hindi Large-v2 model...")
    asr_options = {
        "task": "automatic-speech-recognition",
        "model": "vasista22/whisper-hindi-large-v2",
        "chunk_length_s": 30,
        "device": device,
    }
    transcribe = pipeline(**asr_options)

    # Use the tokenizer's decoder prompt for language "hi" / task "transcribe"
    # as the model's forced decoder ids.
    hindi_prompt_ids = transcribe.tokenizer.get_decoder_prompt_ids(
        language="hi", task="transcribe"
    )
    transcribe.model.config.forced_decoder_ids = hindi_prompt_ids

    # Translation client
    print("Initializing Google Translator...")
    translator = Translator()

    print("Models loaded successfully!")
    return transcribe, translator

In [None]:
def transcribe_and_translate_google(audio_path, transcribe_model, translator):
    """
    Transcribe an audio file to Hindi text, then translate that text to English.

    Args:
        audio_path: Passed unchanged to ``transcribe_model``.
        transcribe_model: Callable whose result is indexed with ``"text"``.
        translator: Object with a ``translate(text, src=..., dest=...)`` method
            whose result exposes a ``.text`` attribute.

    Returns:
        ``(hindi_text, english_text)`` on success. If any step raises, the
        error is printed and ``(None, None)`` is returned instead.
    """
    try:
        # Step 1: speech -> Hindi text
        print("Transcribing audio...")
        hindi_text = transcribe_model(audio_path)["text"]

        # Step 2: Hindi text -> English text
        print("Translating to English using Google Translate...")
        english_text = translator.translate(hindi_text, src='hi', dest='en').text
    except Exception as e:
        # Best-effort contract: report the problem and signal failure to the
        # caller with a pair of Nones rather than propagating.
        print(f"Error: {e}")
        return None, None
    else:
        return hindi_text, english_text

In [None]:
# Test the translation with your example
def test_translation():
    """Translate a fixed Hindi sample sentence to English and print both."""
    client = Translator()
    sample = "मेरा नाम सत्यम है"
    translated = client.translate(sample, src='hi', dest='en')

    print(f"Test Hindi: {sample}")
    print(f"Test English: {translated.text}")
In [None]:
# Driver cell: smoke-test the translator, load the models, ask the user for an
# audio file, then transcribe, translate, save and download the results.

# Run test first
print("Testing translation accuracy:")
test_translation()

# Initialize models
transcriber, translator = initialize_models()

# Upload and process audio
print("\nPlease upload your Hindi audio file:")
uploaded = files.upload()

if not uploaded:
    # With no uploaded entries there is no file name to pick; indexing the
    # empty key list would raise IndexError, so stop here with a message.
    print("No file uploaded; nothing to process.")
else:
    # Only the first uploaded file is processed.
    audio_file = next(iter(uploaded))

    # Process the audio
    hindi_transcription, english_translation = transcribe_and_translate_google(
        audio_file, transcriber, translator
    )

    if hindi_transcription and english_translation:
        print("\n" + "="*60)
        print("RESULTS:")
        print("="*60)
        print("📝 HINDI TRANSCRIPTION:")
        print(hindi_transcription)
        print("\n🔄 ENGLISH TRANSLATION (Google Translate):")
        print(english_translation)
        print("="*60)

        # Saving and downloading happen only on success: after a failure both
        # values are None and f.write(None) would raise TypeError.

        # Save Hindi transcription
        hindi_filename = f"{audio_file}_hindi_transcription.txt"
        with open(hindi_filename, 'w', encoding='utf-8') as f:
            f.write(hindi_transcription)

        # Save English translation
        english_filename = f"{audio_file}_english_translation.txt"
        with open(english_filename, 'w', encoding='utf-8') as f:
            f.write(english_translation)

        # Save combined results
        combined_filename = f"{audio_file}_transcription_and_translation.txt"
        with open(combined_filename, 'w', encoding='utf-8') as f:
            f.write("HINDI TRANSCRIPTION:\n")
            f.write("=" * 30 + "\n")
            f.write(hindi_transcription + "\n\n")
            f.write("ENGLISH TRANSLATION:\n")
            f.write("=" * 30 + "\n")
            f.write(english_translation + "\n")

        print("\nFiles saved:")
        print(f"- Hindi transcription: {hindi_filename}")
        print(f"- English translation: {english_filename}")
        print(f"- Combined file: {combined_filename}")

        # Download the files
        files.download(hindi_filename)
        files.download(english_filename)
        files.download(combined_filename)
    else:
        print("Processing failed!")
