In [1]:
pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ---------------------------------------- 10.2/981.5 kB ? eta -:--:--
     -- ---------------------------------- 61.4/981.5 kB 812.7 kB/s eta 0:00:02
     ------- ------------------------------ 194.6/981.5 kB 1.7 MB/s eta 0:00:01
     ----------------- -------------------- 450.6/981.5 kB 2.8 MB/s eta 0:00:01
     -------------------------------------  972.8/981.5 kB 4.7 MB/s eta 0:00:01
     -------------------------------------- 981.5/981.5 kB 4.4 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py): started
  Building wheel for langdetect (setup.py): finished with status 'done'
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993253 sha256=06e5ea96081a3

In [1]:
# pip install boto3

In [2]:
# pip install awscli

In [21]:
# pip install sentencepiece

Collecting sentencepiece
  Obtaining dependency information for sentencepiece from https://files.pythonhosted.org/packages/a2/f6/587c62fd21fc988555b85351f50bbde43a51524caafd63bc69240ded14fd/sentencepiece-0.2.0-cp311-cp311-win_amd64.whl.metadata
  Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   ---------------------------------------- 10.2/991.5 kB ? eta -:--:--
   --- ------------------------------------ 92.2/991.5 kB 1.3 MB/s eta 0:00:01
   ------- -------------------------------- 184.3/991.5 kB 1.6 MB/s eta 0:00:01
   -------------------- ------------------- 512.0/991.5 kB 3.2 MB/s eta 0:00:01
   --------------------------------- ------ 839.7/991.5 kB 4.1 MB/s eta 0:00:01
   ---------------------------------------- 991.5/991.5 kB 4.8 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed 

In [3]:
import gradio as gr
from transformers import pipeline, AutoTokenizer, TFAutoModelForSeq2SeqLM
from dotenv import load_dotenv
import os
import subprocess
import torch
#Google Text to Speech
from gtts import gTTS
import tempfile
from langdetect import detect

import boto3

In [4]:
# Load environment variables.
load_dotenv()

# Set the model name for our LLMs.
OPENAI_MODEL = "gpt-3.5-turbo"
# Store the API key in a variable.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [5]:
# This function transcribes audio to text using Whisper in the original language it was spoken
def transcribe_audio_original(audio_filepath):
    try:
        transcription_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-large")
        transcription_result = transcription_pipeline(audio_filepath)
        transcribed_text = transcription_result['text']
        return transcribed_text
    except Exception as e:
        print(f"an error occured: {e}")
        return "Error in transcription"

In [6]:
from transformers import pipeline
from transformers import MarianMTModel, MarianTokenizer

# This function transcribes audio to text and then translates it into the specified language
def translate(transcribed_text, target_lang="es"):
    try:
        #Define the model and tokenizer
        src_lang = detect(transcribed_text)
        model_name =f"Helsinki-NLP/opus-mt-{src_lang}-{target_lang}"
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        
        #tokenize the text
        encoded_text = tokenizer(transcribed_text, return_tensors="pt", padding=True)
        
        #generate translation using the model
        translated_tokens = model.generate(**encoded_text)
        
        #decode the translated tokens
        translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
        
        return translated_text
        
    except Exception as e:
        print(f"An error occurred: {e}")
        return "Error in transcription or translation"

In [14]:
# test funciton to make sure it works
transcribed_text = transcribe_audio_original('speech.mp3')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
print(f"{transcribed_text}")


 Hello, how are you? My name is Gregory and I need to say more words to find out how is sound.


In [16]:
# text = "trying to see if we can translate this into Spanish"
translated_text = translate(transcribed_text, "en", "es")
print("Translated text:", translated_text)



Translated text: Hola, ¿cómo estás? Mi nombre es Gregory y necesito decir más palabras para averiguar cómo suena.


In [7]:
# Define function to translate text to speech for output
# Uses Google Text-to-speech

def text_to_speech(text):
    tts = gTTS(text, lang='en')
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
    tts.save(temp_file.name)
    return temp_file.name

In [27]:
# Create a voice map so that correct voice is selected based on target language.

voice_map = {
    "en": "Gregory",
    "es": "Mia",
    "fr": "Liam",
    "de": "Vicki",
    "it": "Bianca",
    "pt": "Camila",
    "cmn-CN": "Zhiyu"
    
}

In [20]:
# Define text-to-speech function using Amazon Polly

def polly_text_to_speech(text, lang_code):
    
    try:
    
        #get the appropriate voice ID from the mapping
        voice_id = voice_map[lang_code]
        
        #initialize boto3 client for polly
        polly_client = boto3.client('polly')
        
        #request speech synthesis
        response = polly_client.synthesize_speech(
            Engine = 'neural',
            Text=text,
            OutputFormat='mp3',
            VoiceId=voice_id
        )
        
        # Save the audio to a temporary file and return its path
        if "AudioStream" in response:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as audio_file:
                audio_file.write(response['AudioStream'].read())
                return audio_file.name
    except boto3.exceptions.Boto3Error as e:
        print(f"Error accessing Polly: {e}")
    return None  # Return None if there was an error
    

In [21]:
# Create a language map from full names to ISO codes
language_map = {
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Chinese (Mandarin)": "cmn-CN",
}

In [24]:
def combined_function (audio_filepath, target_lang):
    target_lang = language_map[target_lang]
    transcribed_text = transcribe_audio_original(audio_filepath)
    speech = polly_text_to_speech(transcribed_text, target_lang)
    translation = translate(transcribed_text, target_lang)
    translated_speech = polly_text_to_speech(translation, target_lang)
    return transcribed_text, translation, speech, translated_speech

In [25]:
# list of languages and their codes for dropdown
languages = gr.Dropdown(label="Select Translation Language", choices=list(language_map.keys()))

In [28]:
# Create Gradio app to:
# 1. transcribe spoken audio to text
# 2. output transcribed text as speech

input_audio = gr.Audio(
            label="click on microphone to record audio", 
            type="filepath", 
            #vaveform options customize the color of the wave seen when recording/playing.
            waveform_options = gr.WaveformOptions(
                waveform_color="#01C6FF",
                waveform_progress_color="#0066B4",
                skip_length=2,
                show_controls=False,
            ),
)

app = gr.Interface(
    fn=combined_function,
    inputs=[input_audio, languages],
    outputs=[
        gr.Textbox(label="Transcribed audio"),
        gr.Textbox(label="Translated text"),
        gr.Audio(label="English speech"),
        gr.Audio(label="Translated speech")],
    title="Audio Transcription and Text to Speech",
    description="Record your question and choose a translation language.")
    
app.launch(show_error=True, share=True) #uncomment share=True in google colab

Running on local URL:  http://127.0.0.1:7872
Running on public URL: https://c901977695be706412.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "c:\Users\mered\anaconda3\Lib\site-packages\uvicorn\protocols\http\httptools_impl.py", line 411, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mered\anaconda3\Lib\site-packages\uvicorn\middleware\proxy_headers.py", line 69, in __call__
    return await self.app(scope, receive, send)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mered\anaconda3\Lib\site-packages\fastapi\applications.py", line 1054, in __call__
    await super().__call__(scope, receive, send)
  File "c:\Users\mered\anaconda3\Lib\site-packages\starlette\applications.py", line 123, in __call__
    await self.middleware_stack(scope, receive, send)
  File "c:\Users\mered\anaconda3\Lib\site-packages\starlette\middleware\errors.py", line 186, in __call__
    raise exc
  File "c:\Users\mered\anaconda3\Lib\site-packages\starlett

An error occurred: Helsinki-NLP/opus-mt-en-cmn-CN is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`


In [None]:
# create combined function with both transcribe audio and text_to_speech

def combined_function (audio_filepath, language_option):
    # language = language_option[0]
    transcribed_text = transcribe_audio_original(audio_filepath)
    
    # map languages to Polly voice IDs
    voice_map = {
        "English": "Joanna",
        "Spanish": "Conchita",
        "French": "Celine",
        "German": "Marlene",
        "Chinese": "Zhiyu"
    }
    voice_id = voice_map.get(language_option, "Joanna")
    transcribed_text_new = transcribe_and_translate(audio_filepath)
    speech_file_path = polly_text_to_speech(transcribed_text, voice_id)
    return transcribed_text, transcribed_text_new, speech_file_path


In [None]:
def test_function(audio_filepath, language):
    return str(language)  # This will show what `language` is being received as.


In [None]:
# Create Gradio app to:
# 1. transcribe spoken audio to text
# 2. output transcribed text as speech

languages = ["English", "Spanish", "French", "German", "Chinese"]
input_audio = gr.Audio(label="Click on the microphone to record audio", type="filepath",
                 waveform_options=gr.WaveformOptions(
                     waveform_color="#01C6FF",
                     waveform_progress_color="#0066B4",
                     skip_length=2,
                     show_controls=False,
                     )
)
dropdown = gr.Dropdown(languages, label="Select Language")

app = gr.Interface(
    fn=combined_function,
    inputs=[input_audio, dropdown],
    outputs=[
        gr.Textbox(label="Transcribed audio"),
        gr.Audio(label="Text to speech output")],
    title="Audio Transcription and Text to Speech",
    description="Select a language, click on the microphone to record audio, then receive transcription in text and speech.")
    
app.launch(show_error=True) #share=True) #uncomment share=True in google colab

In [None]:
# Create Gradio app to:
# 1. transcribe spoken audio to text
# 2. output transcribed text as speech

languages = ["English", "Spanish", "French", "German", "Chinese"]
input_audio = gr.Audio(label="Click on the microphone to record audio", type="filepath",
                 waveform_options=gr.WaveformOptions(
                     waveform_color="#01C6FF",
                     waveform_progress_color="#0066B4",
                     skip_length=2,
                     show_controls=False,
                     ))
dropdown = gr.Dropdown(choices=["English", "Spanish", "French", "German", "Chinese"], label="Select Language")

app = gr.Interface(
    fn=combined_function,
    inputs=[input_audio, dropdown],
    outputs=[
        gr.Textbox(label="Transcribed audio"),
        gr.Audio(label="Text to speech output")],
    title="Audio Transcription and Text to Speech",
    description="Select a language, click on the microphone to record audio, then receive transcription in text and speech.")
    
app.launch(show_error=True) #share=True) #uncomment share=True in google colab