In [18]:
import torch
import torchvision
from torchvision import transforms
import cv2
import numpy as np
import matplotlib.pyplot as plt

# Load pre-trained SSD model
model = torchvision.models.detection.ssd300_vgg16(pretrained=True)
model.eval()

# Define COCO class names
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana',
    'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table',
    'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock',
    'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

# Define a transformation to preprocess the input image
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((300, 300)),
    transforms.ToTensor()
])




In [15]:
def predict(image, model, transform, threshold=0.5):
    # Apply transformations
    image_transformed = transform(image)
    image_transformed = image_transformed.unsqueeze(0)  # Add batch dimension

    # Perform inference
    with torch.no_grad():
        predictions = model(image_transformed)

    pred_boxes = predictions[0]['boxes'].cpu().numpy()
    pred_scores = predictions[0]['scores'].cpu().numpy()
    pred_labels = predictions[0]['labels'].cpu().numpy()

    print("pred_boxes",pred_boxes)
    # Filter out predictions with low scores
    pred_boxes = pred_boxes[pred_scores >= threshold]
    pred_labels = pred_labels[pred_scores >= threshold]
    pred_scores = pred_scores[pred_scores >= threshold]

    return pred_boxes, pred_labels, pred_scores

def draw_predictions(image, boxes, labels, scores, category_names):
    for box, label, score in zip(boxes, labels, scores):
        xmin, ymin, xmax, ymax = box
        cv2.rectangle(image, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0, 255, 0), 2)
        label_text = f"{category_names[label]}: {score:.2f}"
        cv2.putText(image, label_text, (int(xmin), int(ymin) - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    return image

# Load an image
image_path = 'dog.jpg'
image = cv2.imread(image_path)
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Run prediction
boxes, labels, scores = predict(image_rgb, model, transform, threshold=0.5)
print("boxesboxes",boxes)
# Draw predictions on the image
image_with_boxes = draw_predictions(image, boxes, labels, scores, COCO_INSTANCE_CATEGORY_NAMES)

# Display the image
cv2.imshow('Image with Boxes', image_with_boxes)
cv2.waitKey(0)
cv2.destroyAllWindows()


pred_boxes [[5.13689919e+01 1.18488861e+02 1.24388184e+02 2.82524719e+02]
 [5.74255905e+01 6.23866348e+01 2.26933838e+02 2.33915009e+02]
 [1.85251297e+02 4.05267220e+01 2.68781281e+02 8.84308472e+01]
 [5.24272232e+01 7.31441345e+01 9.87501450e+01 1.12197372e+02]
 [1.84690735e+02 4.02561646e+01 2.69746399e+02 8.84116516e+01]
 [4.83782043e+01 8.75227051e+01 1.44665833e+02 2.80105988e+02]
 [3.20394135e+01 2.88523560e+01 1.88672684e+02 1.74532761e+02]
 [2.29861176e+02 5.31240692e+01 2.70084503e+02 8.85816498e+01]
 [5.12976151e+01 6.95824814e+01 1.23398308e+02 1.28194458e+02]
 [2.45803719e+01 3.94134750e+01 4.16341133e+01 6.44665298e+01]
 [1.83590714e+02 4.21247406e+01 2.27649582e+02 7.98392563e+01]
 [4.91693459e+01 1.20075005e+02 1.21503387e+02 2.82158386e+02]
 [3.92272034e+01 5.41129456e+01 1.71166367e+02 1.24431580e+02]
 [1.14405441e+02 1.01780319e+02 2.20862869e+02 2.21421158e+02]
 [2.59190460e+02 4.85221100e+01 2.94981171e+02 9.53119507e+01]
 [4.54269905e+01 6.94272995e+01 1.25854553e+

In [28]:
import torch
import torchvision
from torchvision import transforms
import cv2
import numpy as np

# Load pre-trained SSD model
model = torchvision.models.detection.ssd300_vgg16(pretrained=True)
model.eval()

# Define COCO class names
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana',
    'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table',
    'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock',
    'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

# Define a transformation to preprocess the input image
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((716, 716)),  # Ensure the image size is 300x300
    transforms.ToTensor()
])

def predict(image, model, transform, threshold=0.5):
    # Apply transformations
    image_transformed = transform(image)
    image_transformed = image_transformed.unsqueeze(0)  # Add batch dimension

    # Perform inference
    with torch.no_grad():
        predictions = model(image_transformed)

    pred_boxes = predictions[0]['boxes'].cpu().numpy()
    pred_scores = predictions[0]['scores'].cpu().numpy()
    pred_labels = predictions[0]['labels'].cpu().numpy()

    # Filter out predictions with low scores
    pred_boxes = pred_boxes[pred_scores >= threshold]
    pred_labels = pred_labels[pred_scores >= threshold]
    pred_scores = pred_scores[pred_scores >= threshold]

    return pred_boxes, pred_labels, pred_scores

def draw_predictions(image, boxes, labels, scores, category_names):
    for box, label, score in zip(boxes, labels, scores):
        xmin, ymin, xmax, ymax = box
        cv2.rectangle(image, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0, 255, 0), 2)
        label_text = f"{category_names[label]}: {score:.2f}"
        cv2.putText(image, label_text, (int(xmin), int(ymin) - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    return image

# Load an image
image_path = 'dog.jpg'
image = cv2.imread(image_path)
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Run prediction
boxes, labels, scores = predict(image_rgb, model, transform, threshold=0.5)
print("boxesboxes",boxes)
# Draw predictions on the image
image_with_boxes = draw_predictions(image, boxes, labels, scores, COCO_INSTANCE_CATEGORY_NAMES)

# Display the image
cv2.imshow('Image with Boxes', image_with_boxes)
cv2.waitKey(0)
cv2.destroyAllWindows()


boxesboxes [[122.78454  280.92355  295.2294   674.99615 ]
 [133.39934  147.08163  540.4892   563.2898  ]
 [442.4427    97.084175 642.19434  210.61925 ]]
