In [None]:
!pip install vllm
!pip install --upgrade mistral_common

In [None]:
from vllm import LLM
from vllm.sampling_params import SamplingParams
from dotenv import load_dotenv
import os
import gradio as gr
from pydantic import BaseModel

In [None]:
# python-dotenv: read key=value pairs from a .env file (if one is found) into
# the process environment. No later cell visible here reads os.environ
# directly, so this presumably exists to supply credentials (e.g. a Hugging
# Face token) to the libraries used further down — TODO confirm.
load_dotenv()

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login for notebook environments.
# NOTE(review): presumably required so the model weights can be downloaded
# when the LLM is constructed in the next cell — confirm whether the repo is
# actually gated; this step blocks unattended "Run All" executions.
notebook_login()

In [None]:
# Engine settings for the Pixtral-12B checkpoint, gathered in one place so a
# reader can see (and tune) them at a glance:
#   - tokenizer_mode="mistral" selects the mistral_common tokenizer,
#   - max_model_len bounds the context window (prompt + generated tokens).
PIXTRAL_ENGINE_SETTINGS = {
    "model": "mistral-community/pixtral-12b-240910",
    "tokenizer_mode": "mistral",
    "max_model_len": 5000,
}

# Single shared vLLM engine used by every generation helper in this notebook.
llm = LLM(**PIXTRAL_ENGINE_SETTINGS)

In [None]:
class information(BaseModel):
    """Pydantic model pairing a heading with its content.

    NOTE(review): nothing in the visible notebook instantiates or references
    this model. It is presumably meant as the schema for the JSON that the
    image-extraction prompt asks for — confirm, or remove if unused.
    """
    # Title of a section — presumably of text extracted from an image; TODO confirm.
    heading:str
    # Body text that belongs to the heading.
    content:str

In [None]:
def generate_context(image_url, prompt = "Extract text from the image and give the response in JSON format", max_tokens=8192):
    """Ask the vision model to process a single image and return its reply.

    Builds one user turn consisting of the text ``prompt`` followed by the
    image referenced by ``image_url``, sends it to the module-level ``llm``
    and returns the text of the first completion.

    Args:
        image_url: Location of the image as a string. Leading/trailing
            whitespace (common when pasted into a text box) is stripped.
        prompt: Instruction sent together with the image.
        max_tokens: Upper bound on generated tokens, forwarded to
            ``SamplingParams``. Defaults to the previously hard-coded 8192.

    Returns:
        The generated text of the first completion.

    Raises:
        ValueError: If ``image_url`` is not a non-empty string.
    """
    # Fail fast with a clear message instead of letting an empty/None URL
    # surface as an obscure error from inside the engine.
    if not isinstance(image_url, str) or not image_url.strip():
        raise ValueError("image_url must be a non-empty string")
    image_url = image_url.strip()

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": image_url}}
            ]
        }
    ]

    # NOTE(review): the engine in this notebook is built with
    # max_model_len=5000, which is smaller than the 8192 default here, so the
    # context window is presumably the tighter limit — confirm which bound is
    # intended.
    outputs = llm.chat(
        messages,
        sampling_params=SamplingParams(max_tokens=max_tokens)
    )

    # One request was submitted, so take the first completion of the first result.
    return outputs[0].outputs[0].text


In [None]:
def query_llm(context,query, max_tokens=8192):
    """Answer ``query`` using ``context`` as the only supplied background.

    Sends a single user turn to the module-level ``llm`` containing a fixed
    instruction followed by the question and the context, and returns the text
    of the first completion.

    Args:
        context: Background material to answer from; it is interpolated into
            the prompt as-is.
        query: The question to answer. Must be a non-empty string.
        max_tokens: Upper bound on generated tokens, forwarded to
            ``SamplingParams``. Defaults to the previously hard-coded 8192.

    Returns:
        The generated text of the first completion.

    Raises:
        ValueError: If ``query`` is not a non-empty string.
    """
    # A blank question would still trigger a full (and meaningless)
    # generation, so reject it before touching the model.
    if not isinstance(query, str) or not query.strip():
        raise ValueError("query must be a non-empty string")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "You are an answer generation agent, you'll be given context and query, generate answer in human readable form"},
                {"type": "text", "text": f"here is the question {query} and here is the context {context}"}
            ]
        }
    ]

    # NOTE(review): the engine in this notebook is built with
    # max_model_len=5000, which is smaller than the 8192 default here, so the
    # context window is presumably the tighter limit — confirm which bound is
    # intended.
    outputs = llm.chat(
        messages,
        sampling_params=SamplingParams(max_tokens=max_tokens)
    )

    # One request was submitted, so take the first completion of the first result.
    return outputs[0].outputs[0].text

In [None]:
import gradio as gr

def process_query(url, query):
    """Gradio callback: extract context from an image, then answer a question.

    Args:
        url: Image URL typed by the user.
        query: Question typed by the user.

    Returns:
        A ``(response, context)`` tuple: the generated answer and the context
        text produced from the image, in the order the interface's two output
        boxes expect.

    Raises:
        gr.Error: If either field is blank, so the UI shows a readable message.
    """
    # Check both fields before any model call: the image pass is the expensive
    # step and should not run only to discover the question is missing.
    url = (url or "").strip()
    query = (query or "").strip()
    if not url:
        raise gr.Error("Please enter an image URL.")
    if not query:
        raise gr.Error("Please enter a query.")

    context = generate_context(url)
    response = query_llm(context, query)
    return response, context

if __name__ == "__main__":
    # Input widgets: where the image lives and what the user wants to know.
    url_box = gr.Textbox(label="Enter the URL", placeholder="Enter image URL here")
    query_box = gr.Textbox(label="Enter your query", placeholder="Ask a question about the content")

    # Output widgets, in the same order as process_query's return tuple.
    answer_box = gr.Textbox(label="Response")
    context_box = gr.Textbox(label="Json Parsed Data")

    # Wire the callback to its inputs and outputs.
    interface = gr.Interface(
        fn=process_query,
        inputs=[url_box, query_box],
        outputs=[answer_box, context_box],
        title="Pixtral-12b RAG Application",
        description="Provide an image URL and ask questions based on the context generated from it.",
    )

    # share=True additionally requests a public share link for the app.
    interface.launch(share=True)
