# Streamlit Multimodal — Vision, Audio (Captioning, VQA, TTS/STT)
Interact with multimodal models: image captioning (HF Inference), visual Q&A (Gemini/OpenAI), and optional TTS/STT (commented out for safety).


# Installation (commented)

In [None]:
# !pip install streamlit pillow huggingface_hub google-generativeai openai


# Imports & helpers

In [None]:
import os
import io
import streamlit as st
from PIL import Image

def _get_secret(name):
    """Return the credential called *name*, or ``None`` if it is not configured.

    Streamlit secrets are consulted first and the process environment second,
    so a value in ``secrets.toml`` wins over an environment variable. An empty
    secret is treated as "not set" and falls through to the environment.
    """
    try:
        value = st.secrets.get(name)
    except Exception:
        # Touching st.secrets can raise instead of returning the default, e.g.
        # when no secrets.toml exists. NOTE(review): the exception type differs
        # between Streamlit releases, hence the broad catch -- a missing or
        # unreadable secrets file must not defeat the environment fallback.
        value = None
    return value or os.environ.get(name)


def get_hf_client():
    """Build a Hugging Face ``InferenceClient`` using ``HUGGINGFACEHUB_API_TOKEN``.

    The SDK is imported lazily so the app can start without it installed.
    """
    from huggingface_hub import InferenceClient

    return InferenceClient(token=_get_secret('HUGGINGFACEHUB_API_TOKEN'))


def get_openai_client():
    """Build an ``OpenAI`` client using ``OPENAI_API_KEY``.

    The key is handed to the constructor directly rather than being written
    into ``os.environ``; when no key is found ``None`` is passed and the SDK
    applies its own lookup and error reporting.
    """
    from openai import OpenAI

    return OpenAI(api_key=_get_secret('OPENAI_API_KEY'))


def get_gemini_model(name='gemini-1.5-flash'):
    """Configure the Gemini SDK with ``GEMINI_API_KEY`` and return a model handle.

    Args:
        name: Gemini model identifier passed to ``GenerativeModel``.
    """
    import google.generativeai as genai

    genai.configure(api_key=_get_secret('GEMINI_API_KEY'))
    return genai.GenerativeModel(name)


# UI

In [None]:
# Page metadata and main heading. The icon and title use real emoji and an
# em dash; the previous literals were mis-decoded UTF-8 byte sequences.
st.set_page_config(page_title="Multimodal", page_icon="🖼️")
st.title("🖼️🎙️ Multimodal — Captioning, VQA, TTS/STT")

# Sidebar: choose which provider answers visual questions and which model to use.
# `vqa_provider` and `vqa_model` are read later by the Visual Q&A tab.
with st.sidebar:
    st.header('VQA Provider')
    vqa_provider = st.selectbox('Provider', ['Gemini','OpenAI'], index=0)
    # Label and default model for the text box depend on the selected provider.
    model_label, model_default = (
        ('Gemini model', 'gemini-1.5-flash')
        if vqa_provider == 'Gemini'
        else ('OpenAI model', 'gpt-4o-mini')
    )
    vqa_model = st.text_input(model_label, model_default)
    st.caption('Set HUGGINGFACEHUB_API_TOKEN, GEMINI_API_KEY, OPENAI_API_KEY in secrets/env.')

# One tab per demo; tab1..tab3 are filled in by the sections that follow.
tab1, tab2, tab3 = st.tabs(["Image Captioning", "Visual Q&A", "Audio (TTS/STT)"])

# Tab 1: send an uploaded image to a Hugging Face captioning model.
with tab1:
    st.subheader('Image Captioning (HF Inference API)')
    img_file = st.file_uploader('Upload image', type=['png','jpg','jpeg','webp'])
    model_name = st.text_input('Captioning model', value='nlpconnect/vit-gpt2-image-captioning')
    if st.button('Caption image'):
        if img_file is None:
            # Tell the user why nothing happened instead of ignoring the click.
            st.warning('Please upload an image first.')
        else:
            try:
                # Snapshot the upload as bytes so the API request and the
                # preview below do not share (or depend on) one stream position.
                image_bytes = img_file.getvalue()
                hf = get_hf_client()
                cap = hf.image_to_text(model=model_name, image=image_bytes)
                st.image(Image.open(io.BytesIO(image_bytes)), caption='Uploaded image', use_column_width=True)
                # The result may be an object exposing `.generated_text` or a
                # plain value; show whichever is available.
                st.markdown(f"**Caption:** {getattr(cap, 'generated_text', cap)}")
            except Exception as e:
                # UI boundary: surface any SDK/network/decoding failure in the page.
                st.error(f"Captioning failed: {e}")

# Tab 2: ask a question about an uploaded image using the provider and model
# selected in the sidebar (`vqa_provider`, `vqa_model`).
with tab2:
    st.subheader('Visual Question Answering (VQA)')
    vqa_img = st.file_uploader('Upload image for VQA', type=['png','jpg','jpeg','webp'], key='vqa')
    question = st.text_input('Your question about the image')
    if st.button('Answer question'):
        if vqa_img is None or not question.strip():
            # Tell the user why nothing happened instead of ignoring the click.
            st.warning('Please upload an image and enter a question first.')
        else:
            try:
                # Read the upload once; both providers and the preview reuse
                # these bytes, so no code path depends on the stream position.
                img_bytes = vqa_img.getvalue()
                mime_type = vqa_img.type or 'image/png'
                if vqa_provider == 'Gemini':
                    g = get_gemini_model(vqa_model)
                    # Gemini takes the raw image bytes inline as a content part.
                    out = g.generate_content([
                        question,
                        {'mime_type': mime_type, 'data': img_bytes},
                    ])
                    ans = getattr(out, 'text', str(out))
                else:
                    # Local import: base64 is only needed on the OpenAI path.
                    import base64

                    client = get_openai_client()
                    # A local file is sent to OpenAI as a base64 data URL in
                    # an `image_url` content part; no external hosting needed.
                    encoded = base64.b64encode(img_bytes).decode('ascii')
                    resp = client.chat.completions.create(
                        model=vqa_model,
                        messages=[{
                            'role': 'user',
                            'content': [
                                {'type': 'text', 'text': question},
                                {'type': 'image_url', 'image_url': {'url': f"data:{mime_type};base64,{encoded}"}},
                            ],
                        }],
                    )
                    ans = resp.choices[0].message.content
                st.image(Image.open(io.BytesIO(img_bytes)), use_column_width=True)
                st.markdown(f"**Answer:** {ans}")
            except Exception as e:
                # UI boundary: surface any SDK/network/decoding failure in the page.
                st.error(f"VQA failed: {e}")

# Tab 3: audio examples. Nothing is executed here; the snippet is only shown
# to the user as text via st.code.
with tab3:
    # Subheader uses a real em dash; the previous literal was mis-decoded UTF-8.
    st.subheader('Audio — Text to Speech (TTS) and Speech to Text (STT)')
    st.markdown('- TTS/STT can incur costs and require additional system libs. Examples are shown commented for safety.')
    st.code(
        """
# OpenAI TTS (commented)
# from openai import OpenAI
# client = OpenAI()
# audio = client.audio.speech.create(model='gpt-4o-mini-tts', voice='alloy', input='Hello world')
# with open('tts.wav','wb') as f: f.write(audio.read())

# OpenAI STT (commented)
# with open('audio.wav','rb') as f:
#     text = client.audio.transcriptions.create(model='gpt-4o-transcribe', file=f)
#     print(text.text)
        """,
        language='python'
    )


# Notes
# - HF image captioning requires an access token and model availability.
# - Gemini supports direct byte uploads in prompts; OpenAI generally expects URLs for images.
# - For production, use proper file storage and pass URLs to models; handle safety/size limits.