In [4]:
# ============================
# 🌍 Sheikh Taha's Multimodal Translator Chatbot
# Final Internship Project (One-Page Version)
# ============================

!pip install -q gradio transformers sentencepiece accelerate safetensors timm gTTS diffusers pillow

import os, time, torch
import gradio as gr
from PIL import Image, ImageDraw
from gtts import gTTS
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# -------- Settings --------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN", None)
PAIR2MODEL = {"en>ur":"Helsinki-NLP/opus-mt-en-ur","ur>en":"Helsinki-NLP/opus-mt-ur-en",
              "en>fr":"Helsinki-NLP/opus-mt-en-fr","fr>en":"Helsinki-NLP/opus-mt-fr-en",
              "en>es":"Helsinki-NLP/opus-mt-en-es","es>en":"Helsinki-NLP/opus-mt-es-en"}
LANGS = {"en":"English","ur":"Urdu","fr":"French","es":"Spanish"}
GTTS_LANG = {"en":"en","ur":"ur","fr":"fr","es":"es"}
_translators, _asr, _captioner, _sd = {}, None, None, None

# -------- Translation --------
def load_translator(pair):
    if pair not in _translators:
        tok = AutoTokenizer.from_pretrained(PAIR2MODEL[pair])
        mdl = AutoModelForSeq2SeqLM.from_pretrained(PAIR2MODEL[pair]).to(DEVICE)
        _translators[pair] = (tok, mdl)
    return _translators[pair]

def translate(text, src, tgt):
    if src == tgt: return text
    if src != "en":
        tok, mdl = load_translator(f"{src}>en")
        out = mdl.generate(**tok(text, return_tensors="pt", truncation=True).to(DEVICE), max_new_tokens=256)
        text = tok.decode(out[0], skip_special_tokens=True)
    if tgt != "en":
        tok, mdl = load_translator(f"en>{tgt}")
        out = mdl.generate(**tok(text, return_tensors="pt", truncation=True).to(DEVICE), max_new_tokens=256)
        text = tok.decode(out[0], skip_special_tokens=True)
    return text

# -------- Text-to-Speech --------
def text_to_speech(text, lang):
    fn = f"tts_{int(time.time()*1000)}.mp3"
    gTTS(text=text, lang=GTTS_LANG.get(lang,"en")).save(fn)
    return fn

# -------- ASR --------
def asr(audio_path):
    global _asr
    if _asr is None:
        _asr = pipeline("automatic-speech-recognition","openai/whisper-base",device=0 if DEVICE=="cuda" else -1)
    return _asr(audio_path)["text"]

# -------- Image Captioning --------
def caption_image(img):
    global _captioner
    if _captioner is None:
        _captioner = pipeline("image-to-text","nlpconnect/vit-gpt2-image-captioning",device=0 if DEVICE=="cuda" else -1)
    return _captioner(img)[0]["generated_text"]

# -------- Image Generation --------
def fallback_poster(prompt,w=512,h=512):
    img = Image.new("RGB",(w,h),(40,40,70)); d=ImageDraw.Draw(img)
    d.text((20,20),"\n".join([prompt[i:i+25] for i in range(0,len(prompt),25)]),fill=(255,255,255)); return img

def generate_image(prompt):
    global _sd
    if _sd is None:
        try:
            from diffusers import StableDiffusionPipeline
            _sd = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5",
                    use_auth_token=HUGGINGFACE_TOKEN,torch_dtype=torch.float16 if DEVICE=="cuda" else torch.float32).to(DEVICE)
        except Exception: return fallback_poster(prompt)
    return _sd(prompt, num_inference_steps=20).images[0]

# -------- Gradio UI --------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🌐 Multimodal Translator: Text ↔ Voice ↔ Image")

    with gr.Tab("📝 Text ➜ 🎤 Voice"):
        src=gr.Dropdown(list(LANGS.keys()),value="en",label="Source")
        tgt=gr.Dropdown(list(LANGS.keys()),value="ur",label="Target")
        inp=gr.Textbox(label="Enter text"); out_txt=gr.Textbox(label="Translated"); out_aud=gr.Audio(type="filepath")
        gr.Button("Translate & Speak").click(lambda x,s,t:(translate(x,s,t),text_to_speech(translate(x,s,t),t)),
                                            [inp,src,tgt],[out_txt,out_aud])

    with gr.Tab("🖼️ Image ➜ 🎤 Voice"):
        tgt2=gr.Dropdown(list(LANGS.keys()),value="en",label="Voice Language"); img=gr.Image(type="pil")
        cap_out=gr.Textbox(label="Caption"); aud2=gr.Audio(type="filepath")
        gr.Button("Caption & Speak").click(lambda im,t:(translate(caption_image(im),'en',t),text_to_speech(translate(caption_image(im),'en',t),t)),
                                           [img,tgt2],[cap_out,aud2])

    with gr.Tab("🎤 Voice ➜ 🖼️ Image"):
        tgt3=gr.Dropdown(list(LANGS.keys()),value="en",label="Prompt Lang")
        aud=gr.Audio(sources=["microphone"],type="filepath"); txt=gr.Textbox(); out_img=gr.Image(type="pil")
        def run_voice(a,t): speech=asr(a); en=translate(speech,t,"en") if t!="en" else speech;
        im=generate_image(en); tr=translate(en,"en",t); return tr,im
        gr.Button("Generate Image").click(run_voice,[aud,tgt3],[txt,out_img])

demo.launch(share=True)


[31mERROR: Operation cancelled by user[0m[31m
[0m

SyntaxError: 'return' outside function (ipython-input-1337947970.py, line 100)

In [5]:
# ============================
# 🌍 Sheikh Taha's Multimodal Translator Chatbot
# Final Internship Project (One-Page Fixed Version)
# ============================

!pip install -q gradio transformers sentencepiece accelerate safetensors timm gTTS diffusers pillow

import os, time, torch
import gradio as gr
from PIL import Image, ImageDraw
from gtts import gTTS
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# -------- Settings --------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN", None)
PAIR2MODEL = {"en>ur":"Helsinki-NLP/opus-mt-en-ur","ur>en":"Helsinki-NLP/opus-mt-ur-en",
              "en>fr":"Helsinki-NLP/opus-mt-en-fr","fr>en":"Helsinki-NLP/opus-mt-fr-en",
              "en>es":"Helsinki-NLP/opus-mt-en-es","es>en":"Helsinki-NLP/opus-mt-es-en"}
LANGS = {"en":"English","ur":"Urdu","fr":"French","es":"Spanish"}
GTTS_LANG = {"en":"en","ur":"ur","fr":"fr","es":"es"}
_translators, _asr, _captioner, _sd = {}, None, None, None

# -------- Translation --------
def load_translator(pair):
    if pair not in _translators:
        tok = AutoTokenizer.from_pretrained(PAIR2MODEL[pair])
        mdl = AutoModelForSeq2SeqLM.from_pretrained(PAIR2MODEL[pair]).to(DEVICE)
        _translators[pair] = (tok, mdl)
    return _translators[pair]

def translate(text, src, tgt):
    if src == tgt: return text
    if src != "en":
        tok, mdl = load_translator(f"{src}>en")
        out = mdl.generate(**tok(text, return_tensors="pt", truncation=True).to(DEVICE), max_new_tokens=256)
        text = tok.decode(out[0], skip_special_tokens=True)
    if tgt != "en":
        tok, mdl = load_translator(f"en>{tgt}")
        out = mdl.generate(**tok(text, return_tensors="pt", truncation=True).to(DEVICE), max_new_tokens=256)
        text = tok.decode(out[0], skip_special_tokens=True)
    return text

# -------- Text-to-Speech --------
def text_to_speech(text, lang):
    fn = f"tts_{int(time.time()*1000)}.mp3"
    gTTS(text=text, lang=GTTS_LANG.get(lang,"en")).save(fn)
    return fn

# -------- ASR --------
def asr(audio_path):
    global _asr
    if _asr is None:
        _asr = pipeline("automatic-speech-recognition","openai/whisper-base",device=0 if DEVICE=="cuda" else -1)
    return _asr(audio_path)["text"]

# -------- Image Captioning --------
def caption_image(img):
    global _captioner
    if _captioner is None:
        _captioner = pipeline("image-to-text","nlpconnect/vit-gpt2-image-captioning",device=0 if DEVICE=="cuda" else -1)
    return _captioner(img)[0]["generated_text"]

# -------- Image Generation --------
def fallback_poster(prompt,w=512,h=512):
    img = Image.new("RGB",(w,h),(40,40,70)); d=ImageDraw.Draw(img)
    d.text((20,20),"\n".join([prompt[i:i+25] for i in range(0,len(prompt),25)]),fill=(255,255,255)); return img

def generate_image(prompt):
    global _sd
    if _sd is None:
        try:
            from diffusers import StableDiffusionPipeline
            _sd = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5",
                    use_auth_token=HUGGINGFACE_TOKEN,torch_dtype=torch.float16 if DEVICE=="cuda" else torch.float32).to(DEVICE)
        except Exception: return fallback_poster(prompt)
    return _sd(prompt, num_inference_steps=20).images[0]

# -------- Gradio UI --------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🌐 Multimodal Translator: Text ↔ Voice ↔ Image")

    with gr.Tab("📝 Text ➜ 🎤 Voice"):
        src=gr.Dropdown(list(LANGS.keys()),value="en",label="Source")
        tgt=gr.Dropdown(list(LANGS.keys()),value="ur",label="Target")
        inp=gr.Textbox(label="Enter text"); out_txt=gr.Textbox(label="Translated"); out_aud=gr.Audio(type="filepath")
        def run_text(x,s,t): tr=translate(x,s,t); aud=text_to_speech(tr,t); return tr,aud
        gr.Button("Translate & Speak").click(run_text,[inp,src,tgt],[out_txt,out_aud])

    with gr.Tab("🖼️ Image ➜ 🎤 Voice"):
        tgt2=gr.Dropdown(list(LANGS.keys()),value="en",label="Voice Language"); img=gr.Image(type="pil")
        cap_out=gr.Textbox(label="Caption"); aud2=gr.Audio(type="filepath")
        def run_img(im,t): cap=caption_image(im); tr=translate(cap,'en',t); mp3=text_to_speech(tr,t); return tr,mp3
        gr.Button("Caption & Speak").click(run_img,[img,tgt2],[cap_out,aud2])

    with gr.Tab("🎤 Voice ➜ 🖼️ Image"):
        tgt3=gr.Dropdown(list(LANGS.keys()),value="en",label="Prompt Lang")
        aud=gr.Audio(sources=["microphone"],type="filepath"); txt=gr.Textbox(); out_img=gr.Image(type="pil")
        def run_voice(a,t):
            speech=asr(a); en=translate(speech,t,"en") if t!="en" else speech
            im=generate_image(en); tr=translate(en,"en",t); return tr,im
        gr.Button("Generate Image").click(run_voice,[aud,tgt3],[txt,out_img])

demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://3650ed34f799ff3f0f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


