In [1]:
import logging
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import speech_recognition as sr
import pyttsx3

In [2]:
# Setting up logging for better visibility of the process
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize the speech recognition model once
recognizer = sr.Recognizer()

# Initialize the translation pipeline
translation_pipeline = pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr")

In [3]:
def speech_to_text_from_mic(source_language):
    try:
        with sr.Microphone() as source:
            logger.info("Please speak now...")
            recognizer.adjust_for_ambient_noise(source)  # Adjusting for ambient noise
            audio = recognizer.listen(source)  # Listen to the user's speech
            text = recognizer.recognize_google(audio, language=source_language)
            logger.info(f"Recognized Text: {text}")
            return text
    except sr.UnknownValueError:
        logger.error("Could not understand the audio.")
    except sr.RequestError as e:
        logger.error(f"Could not request results from the speech recognition service; {e}")
    except Exception as e:
        logger.error(f"An unexpected error occurred during speech-to-text conversion: {e}")
    return None

In [4]:
def translate_text(text, source_lang_model="Helsinki-NLP/opus-mt-en-fr"):
    try:
        logger.info(f"Translating text: {text}")
        translated_text = translation_pipeline(text)[0]['translation_text']
        logger.info(f"Translated Text: {translated_text}")
        return translated_text
    except Exception as e:
        logger.error(f"An error occurred during translation: {e}")
    return None

In [5]:

def text_to_speech_with_pyttsx3(text):
    try:
        logger.info("Converting text to speech with pyttsx3...")
        engine = pyttsx3.init()
        engine.say(text)
        engine.runAndWait()
    except Exception as e:
        logger.error(f"An error occurred during text-to-speech conversion: {e}")


In [6]:
def speech_translation_pipeline_from_mic(source_language):
    # Step 1: Convert Speech to Text (from microphone)
    recognized_text = speech_to_text_from_mic(source_language)
    if not recognized_text:
        logger.error("No recognized text. Exiting pipeline.")
        return

    # Step 2: Translate Text
    translated_text = translate_text(recognized_text)
    if not translated_text:
        logger.error("Translation failed. Exiting pipeline.")
        return

    # Step 3: Convert Text to Speech
    text_to_speech_with_pyttsx3(translated_text)


In [7]:
# Input Parameters
source_language = "en-US"  # Language code of the input audio (e.g., "en-US" for English)

# Run the pipeline
speech_translation_pipeline_from_mic(source_language)


INFO:__main__:Please speak now...
INFO:__main__:Recognized Text: hello
INFO:__main__:Translating text: hello
INFO:__main__:Translated Text: Bonjour.
INFO:__main__:Converting text to speech with pyttsx3...
INFO:comtypes.client._code_cache:Imported existing <module 'comtypes.gen' from 'd:\\SEM5\\tts_env\\lib\\site-packages\\comtypes\\gen\\__init__.py'>
INFO:comtypes.client._code_cache:Using writeable comtypes cache directory: 'd:\SEM5\tts_env\lib\site-packages\comtypes\gen'
