<a href="https://colab.research.google.com/github/shehab0911/Healthcare-Translation-Web-App/blob/main/Colab_File.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install TTS transformers faster-whisper gradio torch
!pip install whisper gradio faster-whisper deep-translator gtts
!pip install faster-whisper
! pip install gradio -q
! pip install git+https://github.com/openai/whisper.git -q

Collecting TTS
  Downloading TTS-0.22.0-cp311-cp311-manylinux1_x86_64.whl.metadata (21 kB)
Collecting faster-whisper
  Downloading faster_whisper-1.1.1-py3-none-any.whl.metadata (16 kB)
Collecting gradio
  Downloading gradio-5.20.0-py3-none-any.whl.metadata (16 kB)
Collecting anyascii>=0.3.0 (from TTS)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pysbd>=0.3.4 (from TTS)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting pandas<2.0,>=1.4 (from TTS)
  Downloading pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting trainer>=0.0.32 (from TTS)
  Downloading trainer-0.0.36-py3-none-any.whl.metadata (8.1 kB)
Collecting coqpit>=0.0.16 (from TTS)
  Downloading coqpit-0.0.17-py3-none-any.whl.metadata (11 kB)
Collecting pypinyin (from TTS)
  Downloading pypinyin-0.53.0-py2.py3-none-any.whl.metadata (12 kB)
Collecting hangul-romanize (from TTS)
  Downloading hangul_romanize-0.1.0-py3-none-any.whl.met

In [2]:
import whisper
import gradio as gr
import time
from faster_whisper import WhisperModel as FasterWhisperModel
from transformers import MarianMTModel, MarianTokenizer
from TTS.api import TTS
import torch
import tempfile
import os
import re


# Pick the compute device once, up front, so every model below is placed on
# the same hardware.
device = "cuda" if torch.cuda.is_available() else "cpu"
# faster-whisper precision: half precision on GPU, 8-bit quantisation on CPU.
compute_type = "float16" if device == "cuda" else "int8"

# Multilingual text-to-speech engine used to voice the translated text.
# It follows `device` instead of being pinned to the CPU.
tts_engine = TTS(
    model_name="tts_models/multilingual/multi-dataset/your_tts",
    progress_bar=False,
).to(device)

# Speech-to-text model used when the "Use Faster-Whisper" box is ticked.
model_faster_whisper = FasterWhisperModel("small", device=device, compute_type=compute_type)

# "src-tgt" -> (model, tokenizer); filled lazily by load_translation_model().
translation_model_cache = {}

def load_translation_model(src_lang, tgt_lang):
    """Return the ``(model, tokenizer)`` pair for one translation direction.

    The pair is loaded from the ``Helsinki-NLP/opus-mt-{src}-{tgt}`` checkpoint
    the first time a direction is requested and is served from
    ``translation_model_cache`` on every later call.

    Args:
        src_lang: Source language code (e.g. ``"en"``).
        tgt_lang: Target language code (e.g. ``"fr"``).

    Returns:
        Tuple ``(model, tokenizer)``; the model has been moved to ``device``
        and wrapped with ``torch.compile``.
    """
    pair_id = f"{src_lang}-{tgt_lang}"

    cached_pair = translation_model_cache.get(pair_id)
    if cached_pair is not None:
        return cached_pair

    checkpoint = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
    marian_tokenizer = MarianTokenizer.from_pretrained(checkpoint)
    marian_model = MarianMTModel.from_pretrained(checkpoint).to(device)
    # NOTE(review): confirm torch.compile actually speeds up `.generate()` here;
    # it is kept as-is to preserve behaviour.
    marian_model = torch.compile(marian_model)

    translation_model_cache[pair_id] = (marian_model, marian_tokenizer)
    return translation_model_cache[pair_id]

def split_text(text, max_length=512):
    """Group the sentences of ``text`` into chunks of at most ``max_length`` characters.

    Sentences are detected by whitespace that follows ``.``, ``!`` or ``?``.
    Consecutive sentences are joined with a single space for as long as the
    joined chunk (including that space) stays within ``max_length``.

    A single sentence longer than ``max_length`` is never cut; it is returned
    as a chunk of its own.

    Args:
        text: Text to split.
        max_length: Maximum number of characters per chunk.

    Returns:
        List of non-empty, stripped chunks in their original order. Blank
        input yields an empty list.
    """
    # Drop empty pieces so blank input never produces an empty chunk.
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s]

    chunks = []
    current_chunk = ""
    for sentence in sentences:
        # Measure the chunk exactly as it would be emitted, joining space included.
        candidate = f"{current_chunk} {sentence}" if current_chunk else sentence
        if len(candidate) <= max_length:
            current_chunk = candidate
            continue
        # Only flush a chunk that has content; an oversized first sentence
        # must not leave an empty string behind.
        if current_chunk:
            chunks.append(current_chunk)
        current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

def translate_text(text, src_lang, tgt_lang, batch_size=16):
    """Translate ``text`` from ``src_lang`` to ``tgt_lang``.

    The text is split into chunks with ``split_text``, translated in batches
    of at most ``batch_size`` chunks, and the translated chunks are joined
    with single spaces.

    Args:
        text: Text to translate.
        src_lang: Source language code.
        tgt_lang: Target language code.
        batch_size: Maximum number of chunks sent to the model in one
            ``generate`` call, so long texts do not build one huge batch.

    Returns:
        The translated text. ``text`` is returned unchanged when the two
        languages are equal or when ``text`` is blank.
    """
    if src_lang == tgt_lang or not text.strip():
        return text

    model, tokenizer = load_translation_model(src_lang, tgt_lang)

    # Never hand blank chunks to the model: there is nothing to translate.
    text_chunks = [chunk for chunk in split_text(text) if chunk.strip()]
    if not text_chunks:
        return text

    step = max(1, int(batch_size))
    translated_texts = []
    for start in range(0, len(text_chunks), step):
        batch = text_chunks[start:start + step]
        inputs = tokenizer(
            batch, return_tensors="pt", padding=True, truncation=True, max_length=512
        ).to(device)
        translated_tokens = model.generate(**inputs, num_beams=1)
        translated_texts.extend(
            tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
        )

    return " ".join(translated_texts)


# Display name (as listed in the UI dropdowns) -> two-letter language code.
# The code is handed to the speech-to-text models as the spoken language,
# spliced into the "Helsinki-NLP/opus-mt-{src}-{tgt}" checkpoint name, and
# compared against the TTS engine's language list.
# NOTE(review): not every src/tgt combination is guaranteed to have a published
# opus-mt checkpoint, nor a matching TTS voice — verify the pairs you rely on.
LANGUAGES = {
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Russian": "ru",
    "Chinese": "zh",
    "Japanese": "ja",
    "Arabic": "ar",
    "Portuguese": "pt",
    "Italian": "it",
    "Korean": "ko",
    "Dutch": "nl",
    "Swedish": "sv",
    "Norwegian": "no",
    "Danish": "da",
    "Finnish": "fi",
    "Greek": "el",
}

# Reference Whisper models by size, loaded on first use and then reused, so a
# request does not pay the model-loading cost every time.
_WHISPER_MODEL_CACHE = {}


def _get_whisper_model(size="small"):
    """Return the OpenAI Whisper model of ``size``, loading it only once."""
    if size not in _WHISPER_MODEL_CACHE:
        _WHISPER_MODEL_CACHE[size] = whisper.load_model(size)
    return _WHISPER_MODEL_CACHE[size]


def _resolve_tts_language(lang_code, supported):
    """Return the entry of ``supported`` that matches ``lang_code``, or ``None``.

    An exact match wins; otherwise a regional variant of the same language
    (an entry starting with ``"<lang_code>-"``, e.g. ``"fr-fr"`` for ``"fr"``)
    is accepted.
    """
    if lang_code in supported:
        return lang_code
    prefix = f"{lang_code}-"
    for candidate in supported:
        if candidate.startswith(prefix):
            return candidate
    return None


def transcribe(audio, use_faster, input_lang_full, output_lang_full):
    """Transcribe an audio file, translate the transcript and synthesise speech.

    Args:
        audio: Path of the audio file to process.
        use_faster: Use the faster-whisper model when true, the reference
            Whisper "small" model otherwise.
        input_lang_full: Display name of the spoken language (key of LANGUAGES).
        output_lang_full: Display name of the target language (key of LANGUAGES).
            Unknown names fall back to English for both languages.

    Returns:
        Tuple ``(transcription, inference_time, translated_text, audio_path)``.
        ``inference_time`` is the transcription time in seconds (model loading
        excluded). ``audio_path`` is ``None`` when there is no text to speak.
        On failure the tuple is ``(error_message, None, None, None)``.
    """
    try:
        audio_data = whisper.load_audio(audio)
    except Exception as e:
        return f"Error loading audio: {e}", None, None, None

    input_lang = LANGUAGES.get(input_lang_full, "en")
    output_lang = LANGUAGES.get(output_lang_full, "en")

    try:
        if use_faster:
            start_time = time.time()
            segments, _ = model_faster_whisper.transcribe(
                audio_data, language=input_lang, beam_size=1, word_timestamps=False
            )
            # Join inside the timed section so consuming the segments is counted.
            transcription = " ".join(seg.text for seg in segments).strip()
        else:
            # Fetch the model before starting the clock so the reported time
            # measures inference only, as it does for faster-whisper.
            model = _get_whisper_model("small")
            start_time = time.time()
            result = model.transcribe(audio_data, language=input_lang)
            transcription = result["text"].strip()
        inference_time = time.time() - start_time
    except Exception as e:
        return f"Transcription Error: {e}", None, None, None

    try:
        translated_text = translate_text(transcription, input_lang, output_lang)
    except Exception as e:
        # e.g. no translation checkpoint exists for this language pair.
        return f"Translation Error: {e}", None, None, None

    # Nothing was said (or nothing came back): there is nothing to synthesise.
    if not translated_text.strip():
        return transcription, inference_time, translated_text, None

    audio_output_path = None
    try:
        # Some multilingual TTS models list regional codes (e.g. "fr-fr")
        # rather than bare two-letter codes, so match on the language prefix.
        tts_language = _resolve_tts_language(output_lang, tts_engine.languages or [])
        if tts_language is None:
            print(f"Output language '{output_lang}' is not supported. Falling back to English.")
            tts_language = "en"

        speaker = tts_engine.speakers[0] if tts_engine.speakers else None
        if not speaker:
            raise ValueError("No available speakers found in TTS engine.")

        # Reserve a path and close our handle before the TTS engine writes to it.
        fd, audio_output_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)

        tts_engine.tts_to_file(
            text=translated_text,
            speaker=speaker,
            language=tts_language,
            file_path=audio_output_path
        )

    except Exception as e:
        # Do not leave an orphaned temp file behind when synthesis fails.
        if audio_output_path and os.path.exists(audio_output_path):
            os.remove(audio_output_path)
        return f"TTS Error: {e}", None, None, None

    return transcription, inference_time, translated_text, audio_output_path


def _language_dropdown(label):
    """Build a dropdown listing every LANGUAGES display name, preset to English."""
    return gr.Dropdown(list(LANGUAGES.keys()), label=label, value="English")


# Components are created in the order transcribe() takes its arguments ...
input_components = [
    gr.Audio(type="filepath"),
    gr.Checkbox(label="Use Faster-Whisper"),
    _language_dropdown("Input Language"),
    _language_dropdown("Output Language"),
]

# ... and in the order of the four values it returns.
output_components = [
    gr.Textbox(label="Transcription"),
    gr.Number(label="Inference Time (seconds)"),
    gr.Textbox(label="Translated Text"),
    gr.Audio(label="Translated Audio"),
]

interface = gr.Interface(
    fn=transcribe,
    inputs=input_components,
    outputs=output_components,
    title="Healthcare Translation Web App",
    live=False,  # run only when the user submits, not on every input change
)

interface.launch()


 > Downloading model to /root/.local/share/tts/tts_models--multilingual--multi-dataset--your_tts
 > Model's license - CC BY-NC-ND 4.0
 > Check https://creativecommons.org/licenses/by-nc-nd/4.0/ for more info.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


  return torch.load(f, map_location=map_location, **kwargs)


 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:512
 | > power:1.5
 | > preemphasis:0.97
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:False
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:False
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:True
 | > db_level:-27.0
 | > stats_path:None
 | > base:10
 | > hop_length:160
 | > win_length:400
 > External Speaker Encoder Loaded !!
 > initialization of language-embedding layers.
 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.bin:   0%|          | 0.00/484M [00:00<?, ?B/s]

vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9b30fdaba262a8dcb1.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


