# MSDS 696: Dog Detection

## Rogelio B Delgado

### Spring II 2025

Below are the important libraries necessary to perform this project.

In [None]:
# pip install ultralytics

In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import time
import ipywidgets as widgets
from IPython.display import display
from IPython.display import Image
import torch

Here I set the minimum confidence threshold to 0.70 (70%). This means that while the YOLO model is running, a bounding box and confidence score are only drawn when the model is at least 70% confident that the object is a dog.

In [None]:
# Minimum detection confidence (0-1); passed as `conf=` to the model in the video cells.
min_confidence = 0.70

In [None]:
from ultralytics import YOLO

Gathered from ultralytics.com, dog pose dataset.
https://www.ultralytics.com/blog/custom-training-ultralytics-yolo11-for-dog-pose-estimation

I have already trained the model and saved the resulting weights onto my desktop as best.pt. The training was done on Google Colab because of its computing power compared to my own CPU.

In [None]:
# load model
#model = YOLO("yolo11n-pose.pt")

# Train the model
#results = model.train(data="dog-pose.yaml", epochs=100, imgsz=640)

In [None]:
# Load the previously trained weights; 'best.pt' is resolved relative to the working directory.
model = YOLO('best.pt')

Below is a simple test to ensure that the trained model was loaded correctly. The image was resized to make it more visually appealing. The confidence score was 90%.

 



In [None]:
# Testing image 1: run the model on a single still image and show the annotated result.
results = model('sit_1.jpg')

# Render the first (only) result's annotations onto the image.
image_sit1 = results[0].plot()

# Fixed 800x800 display size, then BGR -> RGB because matplotlib expects RGB channel order.
resized_image = cv2.resize(image_sit1, dsize=(800, 800))
resized_image_rgb = cv2.cvtColor(resized_image, code=cv2.COLOR_BGR2RGB)

plt.imshow(resized_image_rgb)
plt.axis('off')
plt.show()

Image Test 2

In [None]:
# Testing image 2: same check as the first image test, on a second still image.
results1 = model('sit_3.jpg')

# Render the first (only) result's annotations onto the image.
image_sit_3 = results1[0].plot()

# Fixed 800x800 display size, then BGR -> RGB because matplotlib expects RGB channel order.
resized_image1 = cv2.resize(image_sit_3, dsize=(800, 800))
resized_image_rgb1 = cv2.cvtColor(resized_image1, code=cv2.COLOR_BGR2RGB)

plt.imshow(resized_image_rgb1)
plt.axis('off')
plt.show()

Video 1: Atlas Running through the snow in the backyard.

In [None]:
# Play video_1.mp4 inside the notebook, annotating every frame with the model's detections.
video_1 = r'video_1.mp4'
cap = cv2.VideoCapture(video_1)

# Fail loudly on a missing/unreadable file instead of silently showing an empty widget.
if not cap.isOpened():
    raise RuntimeError(f"Could not open video: {video_1}")

# The widget is updated in place with JPEG bytes, which gives video-like playback in the notebook.
img_widget = widgets.Image(format='jpeg', width=800, height=800)
display(img_widget)

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # End of stream (or a read error): stop playback.
            break

        results = model(frame)
        annotated_frame = results[0].plot()

        # The clip is rotated a quarter turn clockwise for display.
        rotated_frame = cv2.rotate(annotated_frame, cv2.ROTATE_90_CLOCKWISE)

        resized_frame = cv2.resize(rotated_frame, (600, 600))

        ret2, jpeg = cv2.imencode('.jpg', resized_frame)
        if ret2:
            img_widget.value = jpeg.tobytes()

        # Tiny pause between frames.
        time.sleep(0.0001)
finally:
    # Always free the capture handle, even if inference fails or the kernel is interrupted.
    cap.release()


This is a video of a herd of deer near my workplace. It is an example of how four-legged animals were identified as dogs. This happens because deer share the same key points as a dog: legs, tail, head, torso, etc. The YOLO pose model is not designed to identify dogs specifically, but rather poses. The problem is that all of the training images are of dogs.

Instead, my project shifted to focus on a specific location: the area outside my porch door. I only have one dog, and no other dogs from family or friends are in my backyard. Thus, there is no need to distinguish my own dog from others.

The model did an accurate job by keeping the bounding box and the confidence score of .70.

In [None]:
# Play Atlas_1.mp4 inside the notebook, keeping only detections at or above min_confidence.
video_2 = r'Atlas_1.mp4'
cap = cv2.VideoCapture(video_2)

# Fail loudly on a missing/unreadable file instead of silently showing an empty widget.
if not cap.isOpened():
    raise RuntimeError(f"Could not open video: {video_2}")

# The widget is updated in place with JPEG bytes, which gives video-like playback in the notebook.
img_widget = widgets.Image(format='jpeg', width=800, height=800)
display(img_widget)

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # End of stream (or a read error): stop playback.
            break

        # conf= drops detections below the configured confidence threshold.
        results = model(frame, conf=min_confidence)
        annotated_frame = results[0].plot()

        # The clip is rotated a quarter turn clockwise for display.
        rotated_frame = cv2.rotate(annotated_frame, cv2.ROTATE_90_CLOCKWISE)

        resized_frame = cv2.resize(rotated_frame, (600, 600))

        ret2, jpeg = cv2.imencode('.jpg', resized_frame)
        if ret2:
            img_widget.value = jpeg.tobytes()

        # Tiny pause between frames.
        time.sleep(0.0001)
finally:
    # Always free the capture handle, even if inference fails or the kernel is interrupted.
    cap.release()

In [None]:
# Play Atlas_1.mp4 in a native OpenCV window with the model's annotations; press "q" to quit.
video_2 = r"Atlas_1.mp4"
cap = cv2.VideoCapture(video_2)

# Fail loudly on a missing/unreadable file instead of opening an empty window.
if not cap.isOpened():
    raise RuntimeError(f"Could not open video: {video_2}")

# NOTE: this rebinds the notebook-wide threshold to 0.5 for this and any later cell.
min_confidence = 0.5

window_name = "Atlas Pose"

try:
    cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
    cv2.resizeWindow(window_name, 800, 800)

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # End of stream (or a read error): stop playback.
            break

        results = model(frame, conf=min_confidence)[0]
        annotated = results.plot()

        # The clip is rotated a quarter turn clockwise for display.
        rotated = cv2.rotate(annotated, cv2.ROTATE_90_CLOCKWISE)
        display_frame = cv2.resize(rotated, (600, 600))

        cv2.imshow(window_name, display_frame)

        # waitKey also pumps the GUI event loop; the mask keeps only the low byte of the key code.
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Always release the capture and close the window, even on errors or a kernel interrupt,
    # so a stale window is not left behind.
    cap.release()
    cv2.destroyAllWindows()


New code attempt: playing the video back at normal speed.

In [None]:
# Play Atlas_1.mp4 at roughly its native frame rate. To keep up, inference runs only on every
# Nth frame and on a downscaled copy; the remaining frames are shown without annotations.
# NOTE: this rebinds `model` to the generic yolov8n.pt detector and `min_confidence` to 0.5.
model = YOLO("yolov8n.pt")
device = "cuda" if torch.cuda.is_available() else "cpu"
min_confidence = 0.5

# yolov8n.pt is a multi-class detector, and every box drawn below is labelled "Atlas".
# Restrict predictions to the "dog" class so other detected objects are not labelled as the dog.
# If the loaded model has no class named "dog", fall back to no filtering (classes=None).
dog_class_ids = [idx for idx, name in model.names.items() if name == "dog"] or None

video_path = r"Atlas_1.mp4"
cap = cv2.VideoCapture(video_path)

# Fail loudly on a missing/unreadable file instead of silently doing nothing.
if not cap.isOpened():
    raise RuntimeError(f"Could not open video: {video_path}")

# Some containers report 0 FPS; fall back to 30 in that case.
fps = cap.get(cv2.CAP_PROP_FPS) or 30
frame_interval = 1.0 / fps  # target seconds per displayed frame

downscale_size  = (480, 270)  # (width, height) fed to the model
display_size    = (600, 600)  # (width, height) shown on screen
process_every_n = 3           # run inference on every 3rd frame only

frame_count = 0

try:
    while cap.isOpened():
        start_time = time.time()
        ret, frame = cap.read()
        if not ret:
            # End of stream (or a read error): stop playback.
            break

        # The clip is rotated a quarter turn clockwise before any further processing.
        frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)

        frame_count += 1
        if frame_count % process_every_n != 0:
            # Skipped frame: display as-is, no inference.
            display_frame = cv2.resize(frame, display_size)
        else:
            small = cv2.resize(frame, downscale_size)

            results = model(
                small,
                device=device,
                conf=min_confidence,
                classes=dog_class_ids,
            )[0]

            for box in results.boxes:
                # Box corners are in the coordinate space of `small`, which is also drawn on.
                x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
                conf = box.conf[0].item()

                cv2.rectangle(small, (x1, y1), (x2, y2), (0, 255, 0), 2)

                label = f"Atlas {conf:.2f}"
                # Clamp the baseline so the label stays visible for boxes touching the top edge.
                label_y = max(y1 - 10, 15)
                cv2.putText(
                    small, label, (x1, label_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2
                )

            display_frame = cv2.resize(small, display_size)

        cv2.imshow("Atlas Pose Estimation", display_frame)

        # Wait out the rest of this frame's time budget (at least 1 ms so the GUI stays responsive).
        elapsed = time.time() - start_time
        wait_ms = max(int((frame_interval - elapsed) * 1000), 1)
        if cv2.waitKey(wait_ms) & 0xFF == ord("q"):
            break
finally:
    # Always release the capture and close the window, even on errors or a kernel interrupt.
    cap.release()
    cv2.destroyAllWindows()


In [None]:
import cv2
import numpy as np
from ultralytics import YOLO

# Rule-based pose labelling ("Sitting" / "Laying Down" / "Unknown") on top of the pose model's
# keypoints, shown frame by frame in an OpenCV window. Press "q" to quit.


def _is_visible(points):
    """Return True when every coordinate in *points* is strictly positive.

    This cell treats a coordinate of 0 as "keypoint not detected".
    """
    return bool(np.all(points > 0))


def _classify_pose(kp, leg_thresh, vert_thresh):
    """Label one detection from its (N, 2) keypoint array.

    NOTE(review): indices 5/6 are treated as the front paws and 11/12 as the rear
    reference points -- confirm against the keypoint order in dog-pose.yaml.

    Returns "Sitting" when both front points are visible and closer together than
    *leg_thresh* pixels; otherwise "Laying Down" when front and rear points are all
    visible and their mean heights differ by less than *vert_thresh* pixels;
    otherwise "Unknown".
    """
    front_ok = _is_visible(kp[[5, 6]])
    back_ok = _is_visible(kp[[11, 12]])

    if front_ok and np.linalg.norm(kp[5] - kp[6]) < leg_thresh:
        return "Sitting"

    if front_ok and back_ok:
        front_y = kp[[5, 6], 1].mean()
        back_y = kp[[11, 12], 1].mean()
        if abs(back_y - front_y) < vert_thresh:
            return "Laying Down"

    return "Unknown"


# 1) Load the one-class pose model
pose_model = YOLO("best.pt")

# 2) Open the video
cap = cv2.VideoCapture("Atlas_1.mp4")
if not cap.isOpened():
    raise RuntimeError("Could not open video")

# 3) Window name (the window itself is created inside the try block so it is always cleaned up)
window_name = "Atlas Pose"

# 4) Thresholds, in pixels of the original frame
leg_dist_threshold = 50   # front paws closer than this => Sitting
vertical_threshold = 30   # front/rear height difference below this => Laying Down

# 5) Skeleton pairs (optional drawing)
# NOTE(review): these index pairs should match the keypoint layout of the trained model -- confirm.
skeleton_pairs = [(0,5),(0,6),(5,11),(6,12),(11,15),(12,15)]

# 6) Max display size to prevent OS clipping
MAX_W, MAX_H = 800, 800

try:
    # Resizable window; the frame itself is scaled manually below.
    cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)

    while True:
        ret, frame = cap.read()
        if not ret:
            # End of stream (or a read error): stop playback.
            break

        # 7) Draw on a copy so the frame passed to the model stays untouched
        annotated = frame.copy()
        h, w = annotated.shape[:2]

        res = pose_model(frame)[0]
        if res.keypoints is not None and len(res.keypoints.xy) > 0:
            # Only the first detection is used. .cpu() is required before .numpy()
            # when inference ran on a GPU; on CPU it is a no-op.
            kp = res.keypoints.xy[0].cpu().numpy()  # (N, 2)

            # Undetected keypoints come back as (0, 0); leave them out of the box so it
            # is not stretched to the image origin.
            visible = kp[np.all(kp > 0, axis=1)]

            if len(visible) > 0:
                pose = _classify_pose(kp, leg_dist_threshold, vertical_threshold)

                # Plain Python ints keep the OpenCV drawing calls happy across versions.
                x1, y1 = (int(v) for v in visible.min(axis=0))
                x2, y2 = (int(v) for v in visible.max(axis=0))
                cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)

                # Since top of video is cut off, put the label below the box,
                # clamped so it stays inside the frame.
                label_y = min(y2 + 25, h - 10)
                cv2.putText(
                    annotated, pose,
                    (x1, label_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 1.2,
                    (0, 255, 0), 2
                )

                # Keypoints as red dots.
                for x, y in visible:
                    cv2.circle(annotated, (int(x), int(y)), 4, (0, 0, 255), -1)

                # Skeleton segments, only where both endpoints were detected.
                for a, b in skeleton_pairs:
                    p1, p2 = kp[a], kp[b]
                    if _is_visible(p1) and _is_visible(p2):
                        cv2.line(
                            annotated,
                            (int(p1[0]), int(p1[1])),
                            (int(p2[0]), int(p2[1])),
                            (255, 0, 0), 2
                        )

        # Shrink (never enlarge) to fit within MAX_W x MAX_H, preserving aspect ratio.
        scale = min(MAX_W / w, MAX_H / h, 1.0)
        disp = cv2.resize(annotated, (int(w * scale), int(h * scale)))

        cv2.imshow(window_name, disp)
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Always release the capture and close the window, even on errors or a kernel interrupt.
    cap.release()
    cv2.destroyAllWindows()
