In [1]:
#!pip install ffmpeg-python

In [1]:
import os
import gradio as gr
import tempfile
import whisper
from pydub import AudioSegment
from openai import OpenAI
from TTS.api import TTS

In [2]:
# Read the API key from the environment instead of embedding it in the notebook,
# so the secret is never saved, committed, or shared together with the file.
# Set OPENAI_API_KEY before starting the kernel.
# NOTE(review): with api_key=None the OpenAI client is expected to fall back to the
# same environment variable and raise a clear error when it is missing — confirm.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [4]:
# Name of the Whisper checkpoint used for every speech-to-text call in this notebook.
WHISPER_MODEL_NAME = "small"
whisper_model = whisper.load_model(WHISPER_MODEL_NAME)

In [5]:
# Identifier of the TTS model used to speak translations in the saved voice.
TTS_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
tts_model = TTS(TTS_MODEL_NAME)

 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
 > Using model: xtts


  self.speakers = torch.load(speaker_file_path)
  return torch.load(f, map_location=map_location, **kwargs)


In [6]:
# Lower-cased language name -> language code handed to the TTS model.
LANGUAGE_CODES = dict(
    french="fr",
    german="de",
    spanish="es",
    hindi="hi",
    russian="ru",
    chinese="zh-cn",
)

# Dropdown choices shown in the UI: the same names, capitalised, in table order.
LANGUAGES = [name.capitalize() for name in LANGUAGE_CODES]

# Directory holding the recorded reference voice; created on first run.
_VOICE_DIR = "user_voices"
os.makedirs(_VOICE_DIR, exist_ok=True)
DEFAULT_VOICE_PATH = _VOICE_DIR + "/my_voice.wav"


def save_voice_sample(audio):
    """Store the recorded clip as the default reference voice, converted to WAV.

    Args:
        audio: Path of the recording to convert, or None when nothing was recorded.

    Returns:
        A ``(status message, saved path)`` pair. The path is None when no
        recording was supplied.
    """
    if audio is not None:
        output_path = DEFAULT_VOICE_PATH
        recording = AudioSegment.from_file(audio)
        recording.export(output_path, format="wav")
        return f"‚úÖ Voice sample saved as {output_path}.", output_path

    # Nothing recorded: report it instead of trying to convert.
    return "‚ö†Ô∏è Please record a voice first.", None

def translate_from_english(audio, target_lang):
    """Transcribe English speech, translate it, and speak the result in the saved voice.

    Args:
        audio: Path of the recorded English clip, or None when nothing was recorded.
        target_lang: Display name of the target language (e.g. "French"). It is
            lower-cased and looked up in LANGUAGE_CODES; unknown names are passed
            to the TTS model lower-cased as-is.

    Returns:
        ``(translated_text, audio_path)`` on success. When a precondition fails
        (no recording, no saved voice sample, or nothing transcribed) it returns
        ``(explanatory message, None)`` instead of raising, so the UI shows the
        reason in the text box.
    """
    if audio is None:
        return "Please record some English speech first.", None

    # Fail early, before any transcription or API cost, when there is no reference voice.
    # NOTE(review): the XTTS model is expected to reject synthesis without a
    # speaker reference — confirm against the Coqui TTS documentation.
    if not os.path.exists(DEFAULT_VOICE_PATH):
        return "Please record and save your voice sample first (tab 1).", None

    # Reserve a temp path, close the handle, then let pydub write the WAV there.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        audio_path = tmp.name
    try:
        AudioSegment.from_file(audio).export(audio_path, format="wav")
        english_text = whisper_model.transcribe(audio_path)["text"].strip()
    finally:
        # The intermediate WAV is only needed for transcription.
        try:
            os.remove(audio_path)
        except OSError:
            pass  # best-effort cleanup; a leftover temp file must not fail the request

    if not english_text:
        return "No speech was recognised in the recording.", None

    translation = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": f"Translate this from English to {target_lang}."},
            {"role": "user", "content": english_text}
        ]
    )
    translated_text = translation.choices[0].message.content
    lang_code = LANGUAGE_CODES.get(target_lang.lower(), target_lang.lower())

    # The output file must outlive this call (the UI plays it), so it is not deleted here;
    # the context manager only makes sure our own handle is closed before TTS writes to it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_file:
        audio_out_path = out_file.name

    tts_model.tts_to_file(
        text=translated_text,
        speaker_wav=DEFAULT_VOICE_PATH,
        language=lang_code,
        file_path=audio_out_path
    )

    return translated_text, audio_out_path


def translate_to_english(audio, source_lang):
    """Transcribe foreign-language speech, translate it to English, and speak it.

    Args:
        audio: Path of the recorded clip, or None when nothing was recorded.
        source_lang: Display name of the spoken language (e.g. "French"); it is
            only inserted into the translation prompt.

    Returns:
        ``(english_text, mp3_path)`` on success. When there is no recording or
        nothing could be transcribed it returns ``(explanatory message, None)``
        instead of raising.
    """
    if audio is None:
        return "Please record some speech first.", None

    # Reserve a temp path, close the handle, then let pydub write the WAV there.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        audio_path = tmp.name
    try:
        AudioSegment.from_file(audio).export(audio_path, format="wav")
        source_text = whisper_model.transcribe(audio_path)["text"].strip()
    finally:
        # The intermediate WAV is only needed for transcription.
        try:
            os.remove(audio_path)
        except OSError:
            pass  # best-effort cleanup; a leftover temp file must not fail the request

    if not source_text:
        return "No speech was recognised in the recording.", None

    translation = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": f"Translate this from {source_lang} to English."},
            {"role": "user", "content": source_text}
        ]
    )
    translated_text = translation.choices[0].message.content

    # The MP3 must outlive this call (the UI plays it), so it is not deleted here.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp_audio:
        english_audio_out = tmp_audio.name

    # Use the streaming-response form: calling stream_to_file on the plain
    # response object is deprecated and emits a warning.
    with client.audio.speech.with_streaming_response.create(
        model="gpt-4o-mini-tts",
        voice="alloy",
        input=translated_text
    ) as response:
        response.stream_to_file(english_audio_out)

    return translated_text, english_audio_out

def text_translate(text, target_lang, direction):
    """Translate text between English and the chosen language.

    Args:
        text: Text to translate. None, empty, or whitespace-only input is
            returned as an empty string without calling the API.
        target_lang: Display name of the non-English language (e.g. "Spanish").
        direction: The UI's direction label. The English-to-target label
            translates from English into ``target_lang``; any other value
            translates from ``target_lang`` into English.

    Returns:
        The translated text with surrounding whitespace removed, or "" when
        there was nothing to translate or the model returned no content.
    """
    # Nothing to translate: skip the API call rather than prompting the model with no input.
    if not text or not text.strip():
        return ""

    if direction == "English ‚Üí Target":
        src_lang, tgt_lang = "English", target_lang
    else:
        src_lang, tgt_lang = target_lang, "English"

    translation = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": (
                    f"You are a professional translator. Translate ONLY the following text "
                    f"from {src_lang} to {tgt_lang}. "
                    f"Respond with ONLY the translated text ‚Äî no explanations, no quotes, no extra words."
                )
            },
            {"role": "user", "content": text},
        ],
    )

    # Guard against a missing message body before stripping.
    return (translation.choices[0].message.content or "").strip()


# Four-tab Gradio app: voice-sample capture, speech translation in both
# directions, and plain text translation. Each tab wires one of the functions
# defined above to its input and output components.
with gr.Blocks(title="üåç AI Voice Translator Studio") as demo:
    gr.Markdown("## üéôÔ∏è Your AI Voice Translator ‚Äî Speak, Record, and Communicate Effortlessly")
    
    # Tab 1: record a clip and pass it to save_voice_sample, which returns a
    # status message and the saved path (shown in the preview player).
    with gr.Tab("üß† 1. Record & Save Your Voice"):
        gr.Markdown("Record your voice ‚Äî the app will use it for translations from English.")
        voice_input = gr.Audio(sources=["microphone"], type="filepath", label="üé§ Record your voice sample")
        save_button = gr.Button("üíæ Save Voice Sample")
        voice_status = gr.Textbox(label="Status", interactive=False)
        saved_voice_preview = gr.Audio(label="üîä Your Saved Voice")
        save_button.click(save_voice_sample, inputs=voice_input, outputs=[voice_status, saved_voice_preview])

    # Tab 2: English speech in; translated text and synthesized audio out
    # (translate_from_english).
    with gr.Tab("üåç 2. English ‚Üí Target Language (Your Voice)"):
        gr.Interface(
            fn=translate_from_english,
            inputs=[
                gr.Audio(sources=["microphone"], type="filepath", label="üéôÔ∏è Speak English"),
                gr.Dropdown(LANGUAGES, label="Translate to", value="French")
            ],
            outputs=[
                gr.Textbox(label="üìù Translated Text"),
                gr.Audio(label="üîä Translation (Your Voice)", interactive=True, show_download_button=True)
            ]
        )

    # Tab 3: speech in the selected language in; English text and audio out
    # (translate_to_english).
    with gr.Tab("üó£Ô∏è 3. Target Language ‚Üí English (AI Voice)"):
        gr.Interface(
            fn=translate_to_english,
            inputs=[
                gr.Audio(sources=["microphone"], type="filepath", label="üéôÔ∏è Speak in Target Language"),
                gr.Dropdown(LANGUAGES, label="Source Language", value="French")
            ],
            outputs=[
                gr.Textbox(label="üìù English Translation"),
                gr.Audio(label="üîä English Voice (GPT Voice)", interactive=True, show_download_button=True)
            ]
        )

    # Tab 4: text-only translation (text_translate). The first Radio choice is
    # the string text_translate compares against to select English -> target;
    # keep the two in sync.
    with gr.Tab("üí¨ 4. Text Translator"):
        gr.Interface(
            fn=text_translate,
            inputs=[
                gr.Textbox(label="‚úçÔ∏è Enter text to translate"),
                gr.Dropdown(LANGUAGES, label="üåç Choose Language", value="Spanish"),
                gr.Radio(["English ‚Üí Target", "Target ‚Üí English"], label="Translation Direction", value="English ‚Üí Target")
            ],
            outputs=gr.Textbox(label="üìù Translated Text")
        )

# Start the app.
demo.launch()
        

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.






 > Text splitted to sentences.
['Lib, ‚àíÔøΩÔøΩ Douglas, ‚àí et rabroue sax sur le c√¥t√© Corin mets tag gawon triomphe.']
 > Processing time: 59.74814987182617
 > Real-time factor: 5.801480944276083


Traceback (most recent call last):
  File "C:\Users\USER\miniconda3\envs\audio_translate\lib\site-packages\gradio\queueing.py", line 759, in process_events
    response = await route_utils.call_process_api(
  File "C:\Users\USER\miniconda3\envs\audio_translate\lib\site-packages\gradio\route_utils.py", line 354, in call_process_api
    output = await app.get_blocks().process_api(
  File "C:\Users\USER\miniconda3\envs\audio_translate\lib\site-packages\gradio\blocks.py", line 2116, in process_api
    result = await self.call_function(
  File "C:\Users\USER\miniconda3\envs\audio_translate\lib\site-packages\gradio\blocks.py", line 1623, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
  File "C:\Users\USER\miniconda3\envs\audio_translate\lib\site-packages\anyio\to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "C:\Users\USER\miniconda3\envs\audio_translate\lib\site-packages\anyio\_backends\_asyncio.py"

 > Text splitted to sentences.
['Bonjour, je veux juste savoir si tu vas bien.']
 > Processing time: 46.48140907287598
 > Real-time factor: 6.627748771707936




 > Text splitted to sentences.
['Bonjour, je veux juste savoir comment tu vas.']
 > Processing time: 32.19431161880493
 > Real-time factor: 6.014339934887563


  response.stream_to_file(english_audio_out)


In [5]:
#!pip uninstall -y numpy==2.2.6

In [6]:
#!python -c "import numpy; print(numpy.__version__)"


In [7]:
#!pip install numpy==1.22.0

In [8]:
#!python -c "import numpy; print(numpy.__version__)"


In [9]:
#pip install transformers==4.37.2 --force-reinstall

In [11]:
#!pip install sounddevice scipy

import sounddevice as sd
from IPython.display import Audio, display
from scipy.io.wavfile import write

# Record a fixed-length mono clip from the default input device and save it as
# the reference voice used for cloning.
fs = 44100    # sample rate passed to the recorder and written into the WAV header
seconds = 10  # recording length

print("Recording... Speak now!")

recording = sd.rec(int(seconds * fs), samplerate=fs, channels=1, dtype='int16')

# Block until the recording buffer has been filled.
sd.wait()

print("Recording complete!")

write("my_voice1.wav", fs, recording)
print("saved as my_voice1.wav")

# A bare expression is only rendered when it is the last statement of a cell.
# More code follows in this cell, so display the player explicitly.
display(Audio(filename="my_voice1.wav"))


# One row per supported language: (name shown in the dropdown, TTS language code).
_LANGUAGE_TABLE = (
    ("French", "fr"),
    ("German", "de"),
    ("Spanish", "es"),
    ("Hindi", "hi"),
    ("Russian", "ru"),
    ("Chinese", "zh-cn"),
)

# Dropdown choices, in table order.
LANGUAGES = [display_name for display_name, _code in _LANGUAGE_TABLE]

# Lower-cased display name -> language code handed to the TTS model.
LANGUAGE_CODES = {display_name.lower(): code for display_name, code in _LANGUAGE_TABLE}

# Reference recording used as the speaker voice for synthesis.
MY_VOICE_SAMPLE = "my_voice1.wav"

def translate_voice_note(audio, target_lang):
    """Transcribe an English voice note, translate it, and speak it in the sample voice.

    Args:
        audio: Path of the recorded English clip, or None when nothing was recorded.
        target_lang: Display name of the target language (e.g. "Chinese"). It is
            lower-cased and looked up in LANGUAGE_CODES; unknown names are passed
            to the TTS model lower-cased as-is.

    Returns:
        ``(translated_text, audio_path)`` on success. When a precondition fails
        (no recording, missing reference voice file, or nothing transcribed) it
        returns ``(explanatory message, None)`` instead of raising.
    """
    if audio is None:
        return "Please record an English voice note first.", None

    # Fail early, before any transcription or API cost, when the reference voice is missing.
    if not os.path.exists(MY_VOICE_SAMPLE):
        return f"Reference voice file {MY_VOICE_SAMPLE} was not found; record it first.", None

    # Reserve a temp path, close the handle, then let pydub write the WAV there.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        audio_path = tmp.name
    try:
        AudioSegment.from_file(audio).export(audio_path, format="wav")
        english_text = whisper_model.transcribe(audio_path)["text"].strip()
    finally:
        # The intermediate WAV is only needed for transcription.
        try:
            os.remove(audio_path)
        except OSError:
            pass  # best-effort cleanup; a leftover temp file must not fail the request

    if not english_text:
        return "No speech was recognised in the recording.", None

    # Translate the transcript with the chat model.
    translation = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": f"Translate this from English to {target_lang}."},
            {"role": "user", "content": english_text}
        ]
    )
    translated_text = translation.choices[0].message.content

    # Map the display name to the code the TTS model expects.
    lang_code = LANGUAGE_CODES.get(target_lang.lower(), target_lang.lower())

    # The output file must outlive this call (the UI plays it), so it is not deleted here;
    # the context manager only makes sure our own handle is closed before TTS writes to it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_file:
        audio_out_path = out_file.name

    tts_model.tts_to_file(
        text=translated_text,
        speaker_wav=MY_VOICE_SAMPLE,
        language=lang_code,
        file_path=audio_out_path
    )

    return translated_text, audio_out_path


# Gradio interface
# Input components: the recorded voice note and the language to translate into.
voice_note_input = gr.Audio(sources=["microphone"], type="filepath", label="üéôÔ∏è Record your English voice note")
target_language_input = gr.Dropdown(LANGUAGES, label="üåç Choose target language", value="Chinese")

# Output components: the translated text and the synthesized audio.
translated_text_output = gr.Textbox(label="üìù Translated Text")
translated_voice_output = gr.Audio(label="üîä Translated Voice (Your Voice!)")

# Single-screen app wrapping translate_voice_note.
demo = gr.Interface(
    translate_voice_note,
    [voice_note_input, target_language_input],
    [translated_text_output, translated_voice_output],
    title="üåç Personal Voice Translator",
    description="Record an English voice note ‚Äî it translates into your chosen language and speaks back in your own voice.",
)

demo.launch()
