In [None]:
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q gradio
!pip install -q gTTS
!pip install -q -U transformers==4.37.2
!pip install -q bitsandbytes==0.41.3 accelerate==0.25.0
!pip install -q git+https://github.com/openai/whisper.git


## For Chat With Image

In [None]:
import gradio as gr
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from gtts import gTTS
import torch
from PIL import Image
import speech_recognition as sr

# Load model and processor
# Both objects are created once at module level and reused by every request
# handled by `analyze_image`.
model_name = "Qwen/Qwen2-VL-2B-Instruct"
# bfloat16 weights with the "eager" attention implementation; device_map="auto"
# leaves device placement to the library instead of naming a device here.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, attn_implementation="eager", device_map="auto"
)
# The processor is loaded from the same checkpoint name as the model. Below it
# is used for chat templating, input preparation and decoding generated ids.
processor = AutoProcessor.from_pretrained(model_name)

# Function to process audio to text
def transcribe_audio(audio_path):
    """Turn a recorded audio file into text.

    Args:
        audio_path: Path of the audio file to read.

    Returns:
        The recognized text on success. If the speech cannot be understood,
        or the recognition request fails, a human-readable error message is
        returned instead (no exception is raised for those two cases).
    """
    engine = sr.Recognizer()

    # Read the whole file into memory before contacting the recognizer.
    with sr.AudioFile(audio_path) as clip:
        recording = engine.record(clip)

    try:
        transcript = engine.recognize_google(recording)
    except sr.UnknownValueError:
        transcript = "Could not understand the audio."
    except sr.RequestError as err:
        transcript = f"Speech recognition error: {err}"
    return transcript

# Function to process image and query
def analyze_image(image_path, text_query, audio_query_path):
    """Answer a question about an image and synthesize the answer as speech.

    Args:
        image_path: Filesystem path of the image to analyze. A falsy value
            means no image was supplied.
        text_query: Typed question. Used only when ``audio_query_path`` is
            falsy; may be ``None`` or empty.
        audio_query_path: Optional path to a recorded question. When given it
            is transcribed and takes precedence over ``text_query``.

    Returns:
        A ``(answer_text, audio_path)`` tuple. ``audio_path`` is the MP3 file
        holding the spoken answer, or ``None`` when the inputs were incomplete
        or the model produced no text to speak.
    """
    missing_input = ("Please provide both an image and a query.", None)

    # Check the image first so no transcription is attempted for a request
    # that is going to be rejected anyway.
    if not image_path:
        return missing_input

    # Handle voice input if provided.
    # NOTE: transcribe_audio reports failures as plain strings, so a failed
    # transcription is forwarded to the model as if it were the question.
    if audio_query_path:
        processed_query = transcribe_audio(audio_query_path)
    else:
        processed_query = text_query

    # Guard against None as well as empty / whitespace-only queries.
    if not processed_query or not processed_query.strip():
        return missing_input

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image_path,
                    "max_pixels": 360 * 420,
                },
                {
                    "type": "text",
                    "text": processed_query,
                },
            ],
        }
    ]
    text_prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Let the vision helper load the image from the message so the
    # ``max_pixels`` limit declared above is actually applied.
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text_prompt],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device rather than assuming CUDA is present.
    inputs = inputs.to(model.device)

    output_ids = model.generate(**inputs, max_new_tokens=1024)

    # generate() returns prompt + completion; drop the prompt tokens.
    generated_ids = [
        full_ids[len(prompt_ids):]
        for prompt_ids, full_ids in zip(inputs.input_ids, output_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )[0]

    # gTTS cannot synthesize empty text; return the (blank) answer without audio.
    if not output_text.strip():
        return output_text, None

    # Convert text to speech
    tts = gTTS(output_text, lang="en")
    audio_file = "output_audio.mp3"
    tts.save(audio_file)

    return output_text, audio_file

# Define Gradio interface
# Layout: a row of inputs (image, typed query, recorded query), an "Analyze"
# button, and a row of outputs (answer text, spoken answer).
with gr.Blocks() as app:
    gr.Markdown("## Image Analyzer with Voice Assistant")

    with gr.Row():
        # type="filepath" makes both the image and audio components hand a
        # file path to the callback.
        image_input = gr.Image(type="filepath", label="Upload Image")
        query_input = gr.Textbox(label="Type Your Query", placeholder="Enter your query here...")
        voice_input = gr.Audio(type="filepath", label="Or Record Your Query")

    analyze_button = gr.Button("Analyze")

    with gr.Row():
        # Output-only widgets (interactive=False).
        output_text = gr.Textbox(label="Analysis Result", interactive=False)
        output_audio = gr.Audio(label="Audio Response", interactive=False)

    # The order of `inputs` matches analyze_image's parameters, and the order
    # of `outputs` matches its (text, audio path) return tuple.
    analyze_button.click(
        fn=analyze_image,
        inputs=[image_input, query_input, voice_input],
        outputs=[output_text, output_audio]
    )

# Launch the app
app.launch()
