## Moondream2 BFloat16

In [None]:
!pip install pyvips

In [2]:
import time
import cv2

import matplotlib.pyplot as plt 
# Render matplotlib figures directly in the notebook output area.
%matplotlib inline
# Use a grayscale colormap by default for images that have no colormap of their own.
plt.rcParams['image.cmap'] = 'gray'

from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image, ImageDraw

### Check List of Models

In [None]:
from huggingface_hub import list_models

# Each entry pairs a Hub author (user or organisation) with the heading that
# is printed above that author's model list.
author_headings = [
    ("vikhyatk", "Models by vikhyatk:"),
    ("moondream", "\nModels under 'moondream' org:"),
]

for hub_author, heading in author_headings:
    author_models = list_models(author=hub_author)
    print(heading)
    for model_info in author_models:
        print("-", model_info.modelId)

In [None]:
# Checkpoint used by every demo cell below, pinned to a dated revision so the
# remotely-loaded model code does not change between runs.
MOONDREAM_REPO = "vikhyatk/moondream2"
MOONDREAM_REVISION = "2025-06-21"

model = AutoModelForCausalLM.from_pretrained(
    MOONDREAM_REPO,
    revision=MOONDREAM_REVISION,
    trust_remote_code=True,  # the model class is shipped inside the Hub repo
    device_map="auto",
)

In [None]:
# Report the dtype of the first parameter tensor of `model`.
first_param = next(model.parameters())
dtype = first_param.dtype
print(dtype)

In [3]:
# Second checkpoint, loaded from the "moondream" organisation for comparison.
# NOTE(review): no revision is pinned here while trust_remote_code is enabled,
# so the downloaded code may change between runs — consider pinning one.
MOONDREAM_ORG_REPO = "moondream/moondream-2b-2025-04-14"

model2 = AutoModelForCausalLM.from_pretrained(
    MOONDREAM_ORG_REPO,
    trust_remote_code=True,
    device_map="auto",
)

In [4]:
# Report the dtype of the first parameter tensor of `model2`.
first_param2 = next(model2.parameters())
dtype = first_param2.dtype
print(dtype)

torch.float16


In [None]:
# Print the string representation of the loaded model object for inspection.
print(model)

### 1. Short Caption

In [None]:
# Load the image that all three captioning cells share.
img = Image.open("../tasks/bird.jpg")

print('Short caption:')
started = time.time()
short_caption = model.caption(img, length="short")["caption"]
print(short_caption)
# The elapsed time covers both generating and printing the caption.
elapsed = time.time() - started
print(f"Total time taken : {round(elapsed,2)}")

### 2. Normal Caption

In [None]:
# Generate a normal-length caption for `img` and time the call plus printing.
print('Normal caption:')
t1 = time.time()
normal_caption = model.caption(img, length="normal")["caption"]

# NOTE(review): without stream=True the caption is presumably a plain string,
# so this loop emits it one character at a time — confirm whether streaming
# output was intended here.
for t in normal_caption:
    print(t, end="", flush=True)
t2 = time.time()
diff = t2 - t1
# The original used "\C", an invalid escape that printed a literal backslash;
# a newline is needed because the caption above is printed without one.
print(f"\nCaption Time : {round(diff,2)}")

### 3. Long Caption

In [None]:
# Generate a long caption for `img`, echo it piece by piece, and report timing.
print('Long caption:')
started = time.time()
long_caption = model.caption(img, length="long")["caption"]

for piece in long_caption:
    print(piece, end="", flush=True)

elapsed = time.time() - started
# Leading newline: the caption pieces above are printed without a line break.
print(f"\nCaption Time : {round(elapsed,2)}")

### 4. VQA using Moondream - Example 1

In [None]:
# Visual question answering: ask the model a counting question about the image.
qimg = Image.open('../tasks/potholes.png')
question = "How many potholes are there in the image?"
print(f"\nVisual query: '{question}'")
response = model.query(qimg, question)
print(response["answer"])

### 5. VQA using Moondream - Example 2

In [None]:
# Visual question answering: ask the model to explain what is happening.
qimg = Image.open('../tasks/cable-trip.jpg')
question = "Why is the person falling?"
print(f"\nVisual query: '{question}'")
response = model.query(qimg, question)
print(response["answer"])

### 6. Object Detection

In [None]:
# Object detection: ask the model for every "face" in the image.
imgf = Image.open('../tasks/driving-gaze.jpg')
w, h = imgf.size  # pixel dimensions, used below to scale the boxes

target = "face"
print(f"\nObject detection: '{target}'")
detection = model.detect(imgf, target)
objects = detection["objects"]
print(f"Found {len(objects)} {target}(s)")

In [None]:
# Sanity-check the coordinate conversion for the detected boxes.
for bbox in objects:
    # Convert normalized to pixel coords
    x_min = int(bbox['x_min'] * w)
    y_min = int(bbox['y_min'] * h)
    x_max = int(bbox['x_max'] * w)
    y_max = int(bbox['y_max'] * h)

# x_min is only bound when the loop ran at least once; guard against an empty
# detection result so this cell does not fail with a NameError.
if objects:
    print(x_min)  # left edge, in pixels, of the last detected box
else:
    print("No objects detected")

In [None]:
# Draw on a copy so the source image `imgf` stays unmodified; otherwise any
# later (or re-run) cell that passes `imgf` to the model sees the annotations.
annotated = imgf.copy()
draw = ImageDraw.Draw(annotated)

for bbox in objects:
    # Convert normalized to pixel coords
    x_min = int(bbox['x_min'] * w)
    y_min = int(bbox['y_min'] * h)
    x_max = int(bbox['x_max'] * w)
    y_max = int(bbox['y_max'] * h)

    # Draw rectangle (outline only)
    draw.rectangle([x_min, y_min, x_max, y_max], outline="green", width=3)

    # Label above the box; clamp to the top edge so the text stays inside the
    # image when the box is within 15 px of the top.
    draw.text((x_min, max(y_min - 15, 0)), "Face", fill="green")

plt.figure(figsize = [20, 8])
# Trailing semicolon suppresses the Text(...) repr of plt.title in the output.
plt.subplot(121); plt.imshow(annotated); plt.title('Person Face');

### 7. Pointing at a Person

In [None]:
# Pointing
print("\nPointing: 'person'")
points = model.point(image, "person")["points"]
print(f"Found {len(points)} person(s)")