In [1]:
import requests

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection 
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device: ", device)

model_id = "IDEA-Research/grounding-dino-tiny"

processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)

image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Check for cats and remote controls
text = "a cat. a remote control."

inputs = processor(images=image, text=text, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)

results = processor.post_process_grounded_object_detection(
    outputs,
    inputs.input_ids,
    box_threshold=0.4,
    text_threshold=0.3,
    target_sizes=[image.size[::-1]]
)

print(results)

device:  cuda
[{'scores': tensor([0.4785, 0.4379, 0.4761], device='cuda:0'), 'labels': ['a cat', 'a cat', 'a remote control'], 'boxes': tensor([[344.6980,  23.1083, 637.1817, 374.2748],
        [ 12.2695,  51.9101, 316.8565, 472.4348],
        [ 38.5854,  70.0091, 176.7766, 118.1754]], device='cuda:0')}]


In [2]:
results

[{'scores': tensor([0.4785, 0.4379, 0.4761], device='cuda:0'),
  'labels': ['a cat', 'a cat', 'a remote control'],
  'boxes': tensor([[344.6980,  23.1083, 637.1817, 374.2748],
          [ 12.2695,  51.9101, 316.8565, 472.4348],
          [ 38.5854,  70.0091, 176.7766, 118.1754]], device='cuda:0')}]

In [2]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection 

device = "cuda" if torch.cuda.is_available() else "cpu"
print("device: ", device)

model_id = "IDEA-Research/grounding-dino-base"

processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)

# Replace this with your local image path
# image_path = "/path/to/your/image.jpg"
image_path = "/home/ammara/Documents/helper_code/extracted_frames/movie_3/frame_0.jpg"
image = Image.open(image_path)

# Check for cats and remote controls
text = "person. golf stick. door."

inputs = processor(images=image, text=text, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)

results = processor.post_process_grounded_object_detection(
    outputs,
    inputs.input_ids,
    box_threshold=0.4,
    text_threshold=0.3,
    target_sizes=[image.size[::-1]]
)

print(results)

device:  cuda


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[{'scores': tensor([0.7616, 0.6544, 0.4237], device='cuda:0'), 'labels': ['person', 'golf stick', 'door'], 'boxes': tensor([[356.5520,  27.0828, 718.6262, 738.6205],
        [361.1831, 374.3941, 508.5401, 680.1787],
        [331.1195,   1.9880, 715.5774, 678.6860]], device='cuda:0')}]


In [5]:
import torch
from PIL import Image
import numpy as np

# Assuming you have the original image
original_image = Image.open("/home/ammara/Documents/helper_code/extracted_frames/movie_3/frame_0.jpg")

# Recreating the results as a Python dictionary with torch tensors
results = [{'scores': torch.tensor([0.7616, 0.6544, 0.4237], device='cuda:0'), 'labels': ['person', 'golf stick', 'door'], 'boxes': torch.tensor([[356.5520,  27.0828, 718.6262, 738.6205],
        [361.1831, 374.3941, 508.5401, 680.1787],
        [331.1195,   1.9880, 715.5774, 678.6860]], device='cuda:0')}]

# Function to crop image based on bounding box
def crop_image(image, box):
    return image.crop(box)

# Find all 'golf stick' boxes
golf_stick_boxes = [box.cpu().numpy() for box, label in zip(results[0]['boxes'], results[0]['labels']) if label == 'golf stick']

# Crop images for each golf stick box
cropped_images = []
for i, box in enumerate(golf_stick_boxes):
    # Convert box coordinates to integers
    box = [int(coord) for coord in box]
    cropped_image = crop_image(original_image, box)
    cropped_images.append(cropped_image)
    
    # Save the cropped image
    cropped_image.save(f"golf_stick_{i+1}.jpg")

print(f"Cropped {len(cropped_images)} golf stick images.")

Cropped 1 golf stick images.
