Accessing the webcam

In [None]:
%pip install ultralytics opencv-python diffusers torch accelerate torch eventlet pillow pytz ultralytics supervision

In [None]:
from ultralytics import YOLO
from pathlib import Path
import numpy as np
import requests
from PIL import Image
from io import BytesIO
import diffusers
import torch
import json

# Instance-segmentation model; the weights are referenced by file name only.
model = YOLO("yolov8m-seg.pt")

# Marigold depth pipeline (LCM checkpoint) loaded with half-precision weights.
pipe = diffusers.MarigoldDepthPipeline.from_pretrained(
    "prs-eth/marigold-depth-lcm-v1-0",
    variant="fp16",
    torch_dtype=torch.float16,
)
# The pipeline is moved to the GPU; this cell requires a CUDA device.
pipe = pipe.to("cuda")

In [None]:
def _mean_depth_at_points(depth_map, points):
    """Return the truncated mean of ``depth_map`` sampled at the in-bounds ``points``.

    Params:
        depth_map: 2-D array indexed as ``[y, x]``.
        points: sequence of ``(x, y)`` coordinates; values are truncated to int32.

    Returns:
        int: mean of the sampled values, or 0 when no point lies inside the map.
    """
    pts = np.asarray(points, dtype=np.int32).reshape(-1, 2)
    xs, ys = pts[:, 0], pts[:, 1]
    height, width = depth_map.shape[:2]
    inside = (xs >= 0) & (xs < width) & (ys >= 0) & (ys < height)
    if not inside.any():
        return 0
    # Accumulate in int64: summing the raw 16-bit samples would wrap around
    # after only a few points under NumPy 2 scalar promotion rules.
    samples = depth_map[ys[inside], xs[inside]].astype(np.int64)
    return int(samples.sum() / samples.size)


def process(image):
    """Detect objects in ``image`` and attach a depth estimate to each of them.

    Runs the global YOLO segmentation ``model`` and the global Marigold depth
    ``pipe`` on the same frame, then samples the 16-bit depth map at the
    polygon points of every detected mask.

    Params:
        image: frame to analyse. The caller in this notebook passes an OpenCV
            BGR ndarray; other input types are forwarded to both models as-is.

    Returns:
        dict: ``"<label>_<index>"`` -> ``{"bbox": [[x1, y1, x2, y2]], "depth": int}``,
        ordered by ascending depth. ``depth`` is the mean 16-bit depth value
        (0 when no polygon point falls inside the depth map).

    Note:
        ``r.masks.xy`` holds the outline polygon of each mask, so depth is
        averaged over outline points only, not over the mask interior.
    """
    # Detection thresholds are stored on the shared model object, as before.
    model.overrides["conf"] = 0.25
    model.overrides["iou"] = 0.45
    res = model.predict(image, imgsz=(480, 640), show_boxes=True)

    # YOLO treats ndarray input as BGR, while the depth pipeline expects RGB,
    # so flip the channel axis for the depth model only. A contiguous copy is
    # made because reversed views have negative strides.
    depth_input = image
    if isinstance(image, np.ndarray) and image.ndim == 3 and image.shape[2] == 3:
        depth_input = np.ascontiguousarray(image[..., ::-1])

    depth = pipe(depth_input)
    vis = pipe.image_processor.export_depth_to_16bit_png(depth.prediction)[0]
    vis_np = np.array(vis)

    object_coordinates = {}

    for r in res:
        if r.masks is None:
            print("Warning: No masks found in the detection results.")
            continue

        for ci, (box, mask) in enumerate(zip(r.boxes, r.masks.xy)):
            label = r.names[int(box.cls.tolist()[0])]
            unique_key = f"{label}_{ci}"

            # box.xyxy is nested ([[x1, y1, x2, y2]]); callers index ['bbox'][0],
            # so the nesting is preserved.
            object_coordinates[unique_key] = {
                "bbox": np.array(box.xyxy.tolist(), dtype=np.int32).tolist(),
                "depth": _mean_depth_at_points(vis_np, mask),
            }

    json_string = dict(sorted(object_coordinates.items(), key=lambda item: item[1]['depth']))
    return json_string

In [None]:
from IPython.display import display, Javascript, Image
from google.colab.output import eval_js
from base64 import b64decode, b64encode
import cv2
import numpy as np
import PIL
import io
import html
import time

Helper functions

In [None]:
def js_to_image(js_reply):
  """
  Decode a data-URL string produced in the browser into an OpenCV image.

  Params:
          js_reply: string of the form "<header>,<base64 payload>"
  Returns:
          img: the payload decoded by cv2.imdecode as a 3-channel colour image
  """
  # Everything after the first comma is the base64-encoded image file.
  payload = js_reply.split(',')[1]
  encoded_pixels = np.frombuffer(b64decode(payload), dtype=np.uint8)
  return cv2.imdecode(encoded_pixels, flags=1)

def bbox_to_bytes(img):
    """
    Converts an OpenCV image (NumPy array) to a base64-encoded PNG data URL.

    Params:
        img: OpenCV BGR image (NumPy ndarray)

    Returns:
        str: "data:image/png;base64,..." string containing the PNG image
    """
    # Import PIL's Image under a private alias. The notebook also runs
    # `from IPython.display import ..., Image`, which rebinds the bare name
    # `Image` to a class without `fromarray`; relying on the global name made
    # this function depend on the order in which import cells were executed.
    from PIL import Image as PILImage

    # PIL expects RGB channel order, OpenCV stores BGR.
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    buffer = io.BytesIO()
    PILImage.fromarray(img_rgb).save(buffer, format='PNG')

    img_base64 = b64encode(buffer.getvalue()).decode('utf-8')
    return f"data:image/png;base64,{img_base64}"


In [None]:
def video_stream():
  """
  Display the JavaScript that implements the in-browser webcam capture.

  The script defines `stream_frame(label, imgData)` plus its helpers:
    - createDom(): on first use builds a bordered container with a status
      label, an <img> element and a stop instruction, requests the camera
      via getUserMedia, and starts a requestAnimationFrame loop. The <video>
      element is created and played but is not appended to the container.
    - onAnimationFrame(): when a capture is pending, draws the current video
      frame onto a 640x480 canvas and resolves with a JPEG data URL
      (quality 0.8), or with "" once shutdown has been requested.
    - removeDom(): stops the first video track and removes the elements.

  `stream_frame` returns '' when a stop was requested before the call;
  otherwise it updates the status label / <img> from its arguments and
  returns an object with the timing fields 'create', 'show', 'capture',
  'sent' and the captured frame under 'img'.

  Clicking the video, the image or the instruction text requests shutdown.
  """
  js = Javascript('''
    var video;
    var div = null;
    var stream;
    var captureCanvas;
    var imgElement;
    var labelElement;

    var pendingResolve = null;
    var shutdown = false;

    function removeDom() {
       stream.getVideoTracks()[0].stop();
       video.remove();
       div.remove();
       video = null;
       div = null;
       stream = null;
       imgElement = null;
       captureCanvas = null;
       labelElement = null;
    }

    function onAnimationFrame() {
      if (!shutdown) {
        window.requestAnimationFrame(onAnimationFrame);
      }
      if (pendingResolve) {
        var result = "";
        if (!shutdown) {
          captureCanvas.getContext('2d').drawImage(video, 0, 0, 640, 480);
          result = captureCanvas.toDataURL('image/jpeg', 0.8)
        }
        var lp = pendingResolve;
        pendingResolve = null;
        lp(result);
      }
    }

    async function createDom() {
      if (div !== null) {
        return stream;
      }

      div = document.createElement('div');
      div.style.border = '2px solid black';
      div.style.padding = '3px';
      div.style.width = '100%';
      div.style.maxWidth = '640px';
      document.body.appendChild(div);

      const modelOut = document.createElement('div');
      modelOut.innerHTML = "<span>Status:</span>";
      labelElement = document.createElement('span');
      labelElement.innerText = 'No data';
      labelElement.style.fontWeight = 'bold';
      modelOut.appendChild(labelElement);
      div.appendChild(modelOut);

      video = document.createElement('video');
      video.style.display = 'block';
      video.width = div.clientWidth - 6;
      video.setAttribute('playsinline', '');
      video.onclick = () => { shutdown = true; };
      stream = await navigator.mediaDevices.getUserMedia(
          {video: { facingMode: "environment"}});

      video.srcObject = stream;
      await video.play();

      captureCanvas = document.createElement('canvas');
      captureCanvas.width = 640; //video.videoWidth;
      captureCanvas.height = 480; //video.videoHeight;
      window.requestAnimationFrame(onAnimationFrame);

      imgElement = document.createElement('img');
      imgElement.onclick = () => { shutdown = true; };
      div.appendChild(imgElement);

      const instruction = document.createElement('div');
      instruction.innerHTML =
          '<span style="color: red; font-weight: bold;">' +
          'When finished, click here or on the video to stop this demo</span>';
      div.appendChild(instruction);
      instruction.onclick = () => { shutdown = true; };
      return stream;
    }
    async function stream_frame(label, imgData) {
      if (shutdown) {
        removeDom();
        shutdown = false;
        return '';
      }

      var preCreate = Date.now();
      stream = await createDom();

      var preShow = Date.now();
      if (label != "") {
        obj = label.split("/")[0]
        took = Math.floor((Date.now() - label.split("/")[1])/1000);
        labelElement.innerHTML = took + "sec ago " + obj;
      }

      if (imgData != "") {
        imgElement.src = imgData;
      }

      var preCapture = Date.now();
      var result = await new Promise(function(resolve, reject) {
        pendingResolve = resolve;
      });
      shutdown = false;

      return {'create': preShow - preCreate,
              'show': preCapture - preShow,
              'capture': Date.now() - preCapture,
              'sent': Date.now(),
              'img': result};
    }
    ''')

  # Displaying the Javascript object runs the script in the cell's output area.
  display(js)

def video_frame(label, bbox):
  """
  Call the browser-side `stream_frame` function and return its result.

  Params:
          label: status text shown next to the stream
          bbox: data-URL string to show in the image element ('' for none)
  Returns:
          data: whatever `stream_frame` resolves to, as returned by eval_js
  """
  # Encode both arguments as JavaScript string literals. Plain "{}"
  # interpolation produced broken (or injected) script whenever a value
  # contained a double quote, a backslash or a line break.
  call = 'stream_frame({}, {})'.format(json.dumps(str(label)), json.dumps(str(bbox)))
  data = eval_js(call)
  return data

In [None]:
import numpy as np
from PIL import Image, ImageDraw
import io
from base64 import b64encode
import cv2
import math

In [None]:
# Start the browser-side capture code, then keep exchanging frames with it:
# each iteration sends the previous status text / annotated frame and
# receives a fresh webcam frame to analyse.
video_stream()
label_html = 'Capturing...'
bbox = ''
count = 0

# Text style for the per-object captions.
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 0.5
color = (255, 255, 255)
thickness = 2

while True:
    js_reply = video_frame(label_html, bbox)
    if not js_reply:
        # The browser side reports '' once the user has stopped the demo.
        break

    # A stop click that lands while a capture is pending resolves with an
    # empty 'img'; decoding it would raise, so treat it as a stop as well.
    frame_data = js_reply["img"]
    if not frame_data:
        break

    img = js_to_image(frame_data)
    if img is None:
        # The payload could not be decoded into an image; skip this frame.
        continue

    out = process(img)
    # "<status>/<timestamp>" is the format the browser-side code splits on "/".
    label_html = str(out) + "/" + str(js_reply["sent"])

    for key, value in out.items():
        # 'bbox' is nested one level deep: [[x1, y1, x2, y2]].
        x, y, p, q = value['bbox'][0]
        text = key + " [" + str(int(value['depth']/1000)) + "]"
        cv2.rectangle(img, (x, y), (p, q), color=(255, 0, 255), thickness=2)
        cv2.putText(img, text, (x, y-10), font, font_scale, color, thickness)

    bbox = bbox_to_bytes(img)


