In [1]:
!pip install fastapi uvicorn pillow transformers accelerate bitsandbytes pyngrok nest_asyncio python-multipart

Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting pyngrok
  Downloading pyngrok-7.5.0-py3-none-any.whl.metadata (8.1 kB)
Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.5.0-py3-none-any.whl (24 kB)
Installing collected packages: pyngrok, bitsandbytes
Successfully installed bitsandbytes-0.49.1 pyngrok-7.5.0


In [2]:
import asyncio
import io
import threading

import torch
from fastapi import FastAPI, UploadFile, File, Form
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

# ASGI application; routes are registered on it via decorators.
app = FastAPI()

# Hugging Face Hub identifier of the vision-language checkpoint to serve.
MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"

print("⏳ Loading Model... This might take 2 minutes...")
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    # `torch_dtype` is deprecated in the installed transformers release (it
    # logs "`torch_dtype` is deprecated! Use `dtype` instead!"), so pass the
    # half-precision setting through `dtype`.
    # NOTE(review): older transformers releases may only accept `torch_dtype`.
    dtype=torch.float16,
    # Let accelerate decide where the weights are placed.
    device_map="auto",
)
# Inference only: switch off training-mode behaviour such as dropout.
model.eval()
print("✅ Model Loaded Successfully!")

# Instruction text paired with the uploaded image, looked up by task type.
# Keys: "text", "table", "diagram", "auto".
PROMPTS = dict(
    # Free-form description, including transcription of handwriting.
    text="Describe this image in detail. If it contains handwritten notes, transcribe them clearly.",
    # Tabular content -> Markdown table.
    table=(
        "Analyze this image. It contains a data table. "
        "Extract all rows and columns and output them strictly as a Markdown table. "
        "Do not add conversational text."
    ),
    # Diagrams / flowcharts -> Mermaid.js graph.
    diagram=(
        "Analyze this image. It is a diagram or flowchart. "
        "Identify nodes and relationships. "
        "Output the structure strictly using Mermaid.js graph syntax. "
        "Do not add conversational text."
    ),
    # Let the model pick the output format itself. The continuation lines of
    # this literal carry their leading spaces into the prompt text.
    auto="""Analyze this image and determine the best structured output.
    - If it is a Table, output a Markdown Table.
    - If it is a Diagram/Flowchart, output Mermaid.js code.
    - If it is text notes, output a Markdown List.
    Output ONLY the raw code/markdown. Do not add conversational text.""",
)

# Only one request at a time may drive the processor/model. Previously this was
# enforced implicitly because inference blocked the event loop; now that the
# work runs in a worker thread the serialization is made explicit.
_INFERENCE_LOCK = threading.Lock()


def _run_inference(image_bytes: bytes, task_type: str) -> str:
    """Decode an uploaded image and generate the model's answer for it.

    This function is synchronous and blocking; `analyze_image` runs it in a
    worker thread so the event loop stays free.

    Args:
        image_bytes: Raw bytes of the uploaded image file.
        task_type: Key into ``PROMPTS``; unknown keys fall back to ``"auto"``.

    Returns:
        The decoded text of the first (only) generated sequence, with the
        prompt portion sliced off.

    Raises:
        Any exception raised by PIL, the processor or the model (for example
        when the bytes are not a decodable image).
    """
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    # Cap the image at 1280x1280 (in place) before handing it to the processor.
    image.thumbnail((1280, 1280))

    prompt_text = PROMPTS.get(task_type, PROMPTS["auto"])
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt_text},
            ],
        }
    ]

    with _INFERENCE_LOCK:
        text_input = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = processor(
            text=[text_input],
            images=[image],
            padding=True,
            return_tensors="pt",
        ).to(model.device)

        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=1024)

        # Keep only what follows the prompt: drop the first len(prompt_ids)
        # tokens from each output sequence.
        generated_ids = [
            full_ids[len(prompt_ids):]
            for prompt_ids, full_ids in zip(inputs.input_ids, output_ids)
        ]
        return processor.batch_decode(
            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )[0]


@app.post("/analyze")
async def analyze_image(
    file: UploadFile = File(...),
    task_type: str = Form("auto")
):
    """Analyze an uploaded image and return the model's structured output.

    Args:
        file: Multipart upload containing the image.
        task_type: One of the ``PROMPTS`` keys ("text", "table", "diagram",
            "auto"). Unknown values silently use the "auto" prompt; the value
            is echoed back unchanged in the response.

    Returns:
        ``{"status": "success", "task_type": ..., "output": ...}`` on success,
        or ``{"status": "error", "message": ...}`` if anything fails. Errors
        are reported in the body rather than raised.
    """
    try:
        image_bytes = await file.read()
        # Image decoding and generation are blocking and can take a long time;
        # running them in a worker thread keeps the event loop (and therefore
        # /health) responsive while a generation is in progress.
        description = await asyncio.to_thread(_run_inference, image_bytes, task_type)

        return {
            "status": "success",
            "task_type": task_type,
            "output": description
        }

    except Exception as e:
        # Deliberate catch-all: clients rely on the JSON "status" field.
        print(f"❌ ERROR: {repr(e)}")
        return {"status": "error", "message": str(e)}

@app.get("/health")
async def health_check():
    """Liveness probe: report that the server is running and which model it serves."""
    payload = {}
    payload["status"] = "running"
    payload["model"] = MODEL_ID
    return payload

⏳ Loading Model... This might take 2 minutes...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.53G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/216 [00:00<?, ?B/s]

✅ Model Loaded Successfully!


In [3]:
import nest_asyncio
nest_asyncio.apply()

import uvicorn
from pyngrok import ngrok
from google.colab import userdata

# Load ngrok token from secrets
ngrok_token = userdata.get("NGROK_AUTH_TOKEN")
ngrok.set_auth_token(ngrok_token)

# Open tunnel
public_url = ngrok.connect(8000)
print("Public URL:", public_url)

# Start server using current event loop
config = uvicorn.Config(app, host="0.0.0.0", port=8000, loop="asyncio")
server = uvicorn.Server(config)

await server.serve()

Public URL: NgrokTunnel: "https://1274221127a2.ngrok-free.app" -> "http://localhost:8000"


INFO:     Started server process [3430]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     122.184.65.228:0 - "GET / HTTP/1.1" 404 Not Found
INFO:     122.184.65.228:0 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO:     122.184.65.228:0 - "GET /health HTTP/1.1" 200 OK
INFO:     122.184.65.228:0 - "GET /health HTTP/1.1" 200 OK
INFO:     122.184.65.228:0 - "POST /analyze HTTP/1.1" 200 OK
INFO:     122.184.65.228:0 - "GET /health HTTP/1.1" 200 OK
INFO:     122.184.65.228:0 - "POST /analyze HTTP/1.1" 200 OK
INFO:     122.184.65.228:0 - "GET /health HTTP/1.1" 200 OK
INFO:     122.184.65.228:0 - "POST /analyze HTTP/1.1" 200 OK
INFO:     122.184.65.228:0 - "GET /health HTTP/1.1" 200 OK
INFO:     122.184.65.228:0 - "POST /analyze HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [3430]
