In [1]:
import whisper
import torch
from transformers import MarianMTModel, MarianTokenizer
import gradio as gr
from sentence_transformers import SentenceTransformer, util

In [4]:
from huggingface_hub import login

In [5]:
import os

In [6]:
# Credentials must never be committed inside a notebook. The token that used
# to be hardcoded on this line has been exposed and should be revoked at
# https://huggingface.co/settings/tokens.
#
# To authenticate, export HUGGINGFACE_HUB_TOKEN in the shell before starting
# Jupyter; this cell never overrides a value that is already set.
# NOTE(review): huggingface_hub documents HF_TOKEN as the variable it reads --
# confirm that HUGGINGFACE_HUB_TOKEN is actually honoured before relying on it.
if not os.environ.get("HUGGINGFACE_HUB_TOKEN"):
    print("HUGGINGFACE_HUB_TOKEN is not set; Hugging Face downloads will be unauthenticated.")

In [7]:
# Load the Whisper model once, at cell execution time. The resulting global is
# shared by transcribe_audio() (transcription) and d_language() (language
# detection). "base" is the checkpoint name handed to whisper.load_model.
whis_model = whisper.load_model("base")

In [8]:
# Load Sentence Transformer for retrieval. The same model embeds the document
# store (doc_embedding) and each incoming query (retrieve_doc).
# NOTE: the global is spelled `retrive_model` everywhere in this notebook;
# keep the spelling consistent when referencing it.
retrive_model = SentenceTransformer('all-MiniLM-L6-v2')

In [9]:
# Dummy RAG document store.
# Each key maps either to a single passage (str) or to a list of passages;
# the flattening step that follows turns both shapes into one flat list of
# strings, so the keys themselves are not used for retrieval.
doc = {
    # Single-passage entries
    "doc1": "RAG is a generative model that can be used for variety of tasks including speech recognition , translation and summarization.",
    "doc2": "Building a multilingual speech recognition without training. ",
    "doc3": "Here we discuss that the field of multilingual AI is rapidly evolving",
    "doc4": "Multilingual embedding models are essential for RAG systems, enabling robust cross-lingual information retrieval and generation.",
    # Multi-passage entry: every string in the list becomes its own retrievable passage
    "doc5": [
        "Key considerations for choosing a multilingual embedding model include language coverage, dimensionality, and integration ease",
        "In the era of global communication, developing effective multilingual AI systems has become increasingly important.",
        "Multilingual-RAG is built upon the powerful architecture of Large Language Models (LLMs) with Retrieve-And-Generate (RAG) capabilities",
        "multilingual speech recognition  means you can now speak naturally in your preferred language and have your device or computer understand you perfectly, no matter what language you're speaking.",
        "This project aims to make it easier for people to access information and use devices in their preferred language",
        "Education will benefit from personalized learning experiences tailored to individual student needs."
    ]
}

# Flatten the store into one list of passages, in dict order: a list-valued
# entry contributes each of its items, any other entry contributes itself.
doc_text = [
    passage
    for entry in doc.values()
    for passage in (entry if isinstance(entry, list) else [entry])
]

# Embed every passage once up front; retrieve_doc() compares queries against
# this tensor, whose row order matches doc_text.
doc_embedding = retrive_model.encode(doc_text, convert_to_tensor=True)

In [10]:
# Speech-to-text via the shared Whisper model
def transcribe_audio(audio_path):
    """Transcribe the audio file at ``audio_path`` with ``whis_model``.

    Returns:
        The value stored under the ``"text"`` key of the Whisper result.
    """
    return whis_model.transcribe(audio_path)["text"]

# Function to detect language
def d_language(audio_path):
    """Detect the spoken language of an audio file with the shared Whisper model.

    The audio is loaded, passed through ``whisper.pad_or_trim``, converted to
    a log-Mel spectrogram on the model's device, and handed to
    ``whis_model.detect_language``. The language code with the highest
    probability is then mapped to a readable name.

    Args:
        audio_path: Path of the audio file to analyse.

    Returns:
        The capitalised language name (e.g. ``"Spanish"``). Codes without an
        entry in the map are returned as the capitalised code itself
        (e.g. ``"Ko"``).
    """
    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(whis_model.device)
    _, probs = whis_model.detect_language(mel)
    detect_lang = max(probs, key=probs.get)

    # Map detected language codes to readable names. The names must match the
    # ones used as keys by load_translation(); 'it' was previously missing, so
    # Italian audio came back as "It" and no translation model could be found.
    language_map = {
        'en': 'English', 'es': 'Spanish', 'fr': 'French', 'de': 'German',
        'hi': 'Hindi', 'ja': 'Japanese', 'ru': 'Russian', 'ar': 'Arabic',
        'te': 'Telugu', 'zh': 'Chinese', 'pt': 'Portuguese', 'it': 'Italian'
    }

    return language_map.get(detect_lang, detect_lang).capitalize()

In [77]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [11]:
# Supported "<source>-<target>" language pairs and the MarianMT checkpoint
# used for each. Built once instead of on every call.
_TRANSLATION_MODELS = {
    "English-Hindi": "Helsinki-NLP/opus-mt-en-hi",
    "English-Spanish": "Helsinki-NLP/opus-mt-en-es",
    "English-Japanese": "Helsinki-NLP/opus-mt-en-jap",
    "English-German": "Helsinki-NLP/opus-mt-en-de",
    "English-Russian": "Helsinki-NLP/opus-mt-en-ru",
    "English-Arabic": "Helsinki-NLP/opus-mt-en-ar",
    "English-Telugu": "Helsinki-NLP/opus-mt-en-te",
    "English-French": "Helsinki-NLP/opus-mt-en-fr",
    "English-Italian": "Helsinki-NLP/opus-mt-en-it",
    "Hindi-English": "Helsinki-NLP/opus-mt-hi-en",
    "Spanish-English": "Helsinki-NLP/opus-mt-es-en",
    "Japanese-English": "Helsinki-NLP/opus-mt-jap-en",
    "German-English": "Helsinki-NLP/opus-mt-de-en",
    "Russian-English": "Helsinki-NLP/opus-mt-ru-en",
    "Arabic-English": "Helsinki-NLP/opus-mt-ar-en",
    "Telugu-English": "Helsinki-NLP/opus-mt-te-en",
    "French-English": "Helsinki-NLP/opus-mt-fr-en",
    "Italian-English": "Helsinki-NLP/opus-mt-it-en",
}

# (model, tokenizer) pairs already loaded in this kernel, keyed like
# _TRANSLATION_MODELS. Trades memory for not reloading a checkpoint on every
# request; the number of entries is bounded by the table above.
_TRANSLATION_CACHE = {}


# Function to load translation model and tokenizer
def load_translation(source_language, target_language):
    """Return the MarianMT model and tokenizer for a language pair.

    The pair is loaded with ``from_pretrained`` the first time it is requested
    and served from an in-memory cache afterwards, so repeated requests for
    the same pair return the same objects.

    Args:
        source_language: Readable source language name, e.g. ``"English"``.
        target_language: Readable target language name, e.g. ``"Hindi"``.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If no checkpoint is registered for the requested pair.
    """
    key = f"{source_language}-{target_language}"
    if key not in _TRANSLATION_MODELS:
        raise ValueError(f"Translation model for {source_language} to {target_language} not available.")

    if key not in _TRANSLATION_CACHE:
        checkpoint = _TRANSLATION_MODELS[key]
        translate_model = MarianMTModel.from_pretrained(checkpoint)
        translate_tokenizer = MarianTokenizer.from_pretrained(checkpoint)
        _TRANSLATION_CACHE[key] = (translate_model, translate_tokenizer)
    return _TRANSLATION_CACHE[key]

In [12]:
# Function to translate text
def translate_t(text, model, tokenizer):
    """Translate ``text`` with a seq2seq model and its tokenizer.

    Args:
        text: Source text to translate.
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``; it is called to encode the
            text and its ``decode`` turns generated ids back into a string.

    Returns:
        The decoded translation of the first generated sequence, or ``""``
        when ``text`` is empty or whitespace only.
    """
    # Nothing to translate: skip the model rather than decode whatever it
    # generates from an empty prompt.
    if not text or not text.strip():
        return ""

    # truncation=True caps the input at the tokenizer's maximum length so a
    # long transcription is shortened instead of failing inside generate().
    # The tensors are moved to the model's device in case it is not on CPU.
    encoded = tokenizer(
        text, return_tensors="pt", padding=True, truncation=True
    ).to(model.device)
    with torch.no_grad():
        translated_token = model.generate(**encoded)
    return tokenizer.decode(translated_token[0], skip_special_tokens=True)

In [13]:
def retrieve_doc(query):
    """Return the stored passage most similar to ``query``.

    The query is embedded with ``retrive_model`` and compared against
    ``doc_embedding`` by cosine similarity; the passage at the highest-scoring
    position of ``doc_text`` is returned.

    Raises:
        ValueError: If there are no scores or any score is NaN.
        IndexError: If the best position does not index into ``doc_text``.
    """
    query_vector = retrive_model.encode(query, convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(query_vector, doc_embedding)[0]

    # Reject an empty or NaN-contaminated score vector before ranking.
    has_scores = len(similarities) != 0
    if not has_scores or torch.isnan(similarities).any():
        raise ValueError("No valid scores found for the query.")

    best = similarities.argmax().item()

    # Guard against the embeddings and the passage list being out of sync.
    if not (0 <= best < len(doc_text)):
        raise IndexError("Top score index out of range.")
    return doc_text[best]

In [15]:
# Function to process the audio file and return transcriptions and translations
def process_audio(audio, target_language):
    """Transcribe, translate and look up a document for one audio file.

    Args:
        audio: Path of the audio file to process.
        target_language: Readable name of the language to translate into.
            ``"English"`` is accepted and returns the English text unchanged.

    Returns:
        A 4-tuple ``(transcription, detected_language, translated_text,
        retrieved_document)``. ``transcription`` is the English text: when the
        detected language is not English, the Whisper transcript is first
        translated to English and that translation is what is returned here
        and used for retrieval.

    Raises:
        ValueError: If ``audio`` is missing, or if no translation model exists
            for a required language pair (raised by ``load_translation``).
    """
    # A cleared audio widget would pass None; fail with a clear message
    # instead of an obscure error from inside Whisper.
    if not audio:
        raise ValueError("No audio provided.")

    # Transcribe the audio
    transcription = transcribe_audio(audio)

    # Detect the language spoken in the audio
    detected_language = d_language(audio)

    # Translate the transcribed text to English if it's not in English
    if detected_language != "English":
        translation_model, translation_tokenizer = load_translation(detected_language, "English")
        transcription = translate_t(transcription, translation_model, translation_tokenizer)

    if target_language == "English":
        # The text is already English and there is no English-English model,
        # so asking load_translation for one would raise ValueError.
        translated_text = transcription
    else:
        # Load the appropriate translation model to the target language
        translation_model, translation_tokenizer = load_translation("English", target_language)

        # Translate the transcribed text to the target language
        translated_text = translate_t(transcription, translation_model, translation_tokenizer)

    # Retrieve document based on the transcribed text
    retrieved_document = retrieve_doc(transcription)

    return transcription, detected_language, translated_text, retrieved_document

# Choices offered in the target-language dropdown, in display order
target_languages = ["Hindi", "Spanish", "Japanese", "German", "Russian", "Arabic", "French", "Italian", "English"]

# One textbox per value returned by process_audio, in the same order
output_labels = ["Transcription", "Detected Language", "Translation", "Retrieved Document"]

# Wire process_audio into a Gradio UI: an audio file plus a target language
# in, four text fields out.
iface = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Dropdown(target_languages, label="Target Language"),
    ],
    outputs=[gr.Textbox(label=label) for label in output_labels],
    title="Multilingual Speech Recognition, Translation, and Document Retrieval",
    description="Upload an audio file in any language, select a target language to get the transcription, translation, and retrieve a document based on the transcription.",
)

# Start serving the interface
iface.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




