In [1]:
!pip install --user opencv-python
!pip install --user ultralytics



In [2]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; force_remount=True is passed
# so the mount is requested again even if the cell is re-run.
GDRIVE_MOUNT_POINT = '/content/gdrive/'
drive.mount(GDRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/gdrive/


In [3]:
import os
import cv2
import glob

import subprocess
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import IPython.display as ipd
from ultralytics import YOLO
from ultralytics.utils.plotting import Annotator

# Google Drive folder the video file is read from (joined with `file` in the
# preview cell below).
filepath = '/content/gdrive/MyDrive/Computer_Vision'

# Detection model shared by every later cell; the recorded output shows the
# "yolov8m.pt" weights being downloaded when they are not already present.
model = YOLO("yolov8m.pt")

Downloading https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8m.pt to 'yolov8m.pt'...


100%|██████████| 49.7M/49.7M [00:00<00:00, 161MB/s]


In [4]:
# Preview the input clip inline; the Video object must stay the cell's last
# expression so the notebook renders it.
file = 'beach_run.mp4'
video_path = f'{filepath}/{file}'
ipd.Video(video_path, width=500, embed=True)

Output hidden; open in https://colab.research.google.com to view.

In [7]:
def detect_hand(result):
  """Return True if any detection in `result` is labelled 'hand'.

  Args:
    result: Iterable of prediction results (e.g. the list returned by
      `model.predict`). Each item must expose `.boxes`, an iterable of
      detections whose `.cls` converts to an integer class index.

  Returns:
    bool: True as soon as a 'hand' detection is found in any item; False
    when none is found, including when `result` is empty.

  NOTE(review): the class list printed in this notebook's output shows no
  'hand' label in its visible portion, so with the stock weights this may
  never match -- confirm the model actually has such a class.
  """
  for r in result:
    boxes = r.boxes
    if boxes is None:
      continue

    # Prefer the class-name table carried by the result itself; fall back to
    # the notebook-level `model` when the result does not provide one.
    names = getattr(r, 'names', None) or model.names

    for box in boxes:
      if names[int(box.cls)] == 'hand':
        return True

  # Only conclude "no hand" once every result has been inspected (the old
  # code returned after looking at the first result only).
  return False



In [None]:
def _clock_direction(vector_x, vector_y):
  """Map an offset from the frame centre to a clock-face label.

  The angle is measured with atan2 in image coordinates (y grows downward),
  so 0 degrees is "3 o'clock" and 90 degrees is "6 o'clock". Each hour
  covers a 30-degree sector.
  """
  import math

  angle_deg = math.degrees(math.atan2(vector_y, vector_x)) % 360
  # `% 12` keeps a float that rounds up to exactly 360.0 inside the table.
  sector = int(angle_deg // 30) % 12
  hour = (sector + 2) % 12 + 1
  return f"{hour} o'clock"


def calculate_distance(result, class_names, target_objects, object_size_dict):
  """Annotate and announce every target object found in one detection result.

  For each box whose lower-cased class name is in `target_objects`, this
  estimates a distance, works out a clock-face direction relative to the
  frame centre, draws both onto the current frame and speaks them through
  `voice_notification`.

  Reads the notebook-level globals `img` (frame drawn on), `frame_width` and
  `frame_height`, and optionally `real_width`.

  Args:
    result: A single prediction result exposing `.boxes`; each box provides
      `.xyxy[0]` (x1, y1, x2, y2) and `.cls`.
    class_names: Mapping from class index to class name.
    target_objects: Container of lower-case class names to report.
    object_size_dict: Mapping from lower-case class name to the object's real
      width. When a name is missing, the global `real_width` is used if it is
      defined, otherwise 0.15 (the default `voice_command` uses).

  Returns:
    None. The effects are the drawing on `img` and the spoken notification.
  """
  boxes = result.boxes
  if boxes is None:
    return

  for box in boxes:
    x1, y1, x2, y2 = (int(v) for v in box.xyxy[0])
    cls = int(box.cls)

    obj_name = class_names[cls].lower()
    if obj_name not in target_objects:
      continue

    camera_width = x2 - x1
    if camera_width <= 0:
      # A degenerate box would divide by zero below; nothing useful to report.
      continue

    if object_size_dict and obj_name in object_size_dict:
      width_real = float(object_size_dict[obj_name])
    else:
      try:
        width_real = real_width
      except NameError:
        width_real = 0.15

    # NOTE(review): frame_width stands in for the focal length in pixels in
    # this similar-triangles estimate -- confirm against a calibrated camera.
    distance = (width_real * frame_width) / camera_width

    obj_center_x = (x1 + x2) // 2
    obj_center_y = (y1 + y2) // 2
    camera_middle_x = frame_width // 2
    camera_middle_y = frame_height // 2

    direction = _clock_direction(obj_center_x - camera_middle_x,
                                 obj_center_y - camera_middle_y)

    cv2.putText(img, direction, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
    cv2.putText(img, "Distance: {:.2f} meters".format(distance), (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 255), 3)

    voice_notification(obj_name, direction, distance)


In [None]:
import math
import speech_recognition as sr

def voice_command():
  """Listen on the microphone and extract a target object name from speech.

  The recorded audio is sent to `recognizer.recognize_google`; the last word
  of the recognised command (via `get_last_word`) is taken as the target
  object, and its width is looked up in the global `object_dimensions`.

  Returns:
    tuple: `(target_object, real_width)`. `target_object` is "" when nothing
    usable was recognised; `real_width` is 0.15 unless `object_dimensions`
    has an entry for the target.
  """
  recognizer = sr.Recognizer()

  with sr.Microphone() as source:
    print("Waiting for voice command...")
    recognizer.adjust_for_ambient_noise(source)
    audio = recognizer.listen(source)

  target_object = ""
  real_width = 0.15

  try:
    command = recognizer.recognize_google(audio, language="en-US")
    print("Recognized command:", command)
    last_word = get_last_word(command.lower())

    # Only use the last word when there is one: calling .lower() on a falsy
    # result (e.g. None) would raise outside the handled exception types.
    if last_word:
      print("Last word:", last_word)
      target_object = last_word.lower()

      if target_object in object_dimensions:
        real_width = float(object_dimensions[target_object])
        print(real_width)
      else:
        print(f"No length information found for {target_object}, using the default value of 0.15.")
  except sr.UnknownValueError:
    print("Voice cannot be understood.")
  except sr.RequestError as e:
    print("Voice recognition error; {0}".format(e))

  return target_object, real_width

def voice_notification(obj_name, direction, distance):
  """Speak where an object is and how far away it is.

  Args:
    obj_name: Name of the detected object.
    direction: Direction label to announce (e.g. "3 o'clock").
    distance: Distance value, formatted with two decimals and announced as
      meters.
  """
  # No visible cell imports pyttsx3, so import it here to avoid a NameError
  # on first use regardless of which cells have been run.
  import pyttsx3

  engine = pyttsx3.init()
  text = "{} is at {}. It is {:.2f} meters away.".format(obj_name, direction, distance)
  engine.say(text)
  engine.runAndWait()

In [None]:
# Open capture device 0 and record its nominal properties. `frame_width`,
# `frame_height` and `img` are kept as notebook-level names because
# `calculate_distance` reads them as globals.
vid = cv2.VideoCapture(0)
fps = vid.get(cv2.CAP_PROP_FPS)

frame_width = vid.get(cv2.CAP_PROP_FRAME_WIDTH)
frame_height = vid.get(cv2.CAP_PROP_FRAME_HEIGHT)

print(f'Frames per second: {fps:0.2f}')

ct = 0
try:
  while True:
    ret, img = vid.read()
    if not ret:
      # No frame was delivered: stop before touching `img` (it is None here)
      # and without counting the failed read as a frame.
      break

    # Run detection on every 10th frame only.
    if ct % 10 == 0:
      # Predict on the in-memory frame instead of round-tripping it through a
      # temporary JPEG on disk.
      result = model.predict(img)

      hand_check = detect_hand(result)
      if hand_check:
        print(f'Detected hand in video at frame {ct}')

    ct += 1
finally:
  # Always free the capture device, even if prediction raises.
  vid.release()

print(f'Number of frames {ct}')

Frames per second: 19.92

image 1/1 /content/test_frame.jpg: 384x640 1 person, 1189.4ms
Speed: 5.9ms preprocess, 1189.4ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /content/test_frame.jpg: 384x640 1 person, 908.7ms
Speed: 2.5ms preprocess, 908.7ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /content/test_frame.jpg: 384x640 1 person, 1 remote, 1 cell phone, 930.3ms
Speed: 2.3ms preprocess, 930.3ms inference, 2.7ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /content/test_frame.jpg: 384x640 1 person, 1 cell phone, 1020.4ms
Speed: 2.7ms preprocess, 1020.4ms inference, 3.6ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /content/test_frame.jpg: 384x640 1 person, 1041.0ms
Speed: 2.9ms preprocess, 1041.0ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)

image 1/1 /content/test_frame.jpg: 384x640 1 person, 1030.2ms
Speed: 2.7ms preprocess, 1030.2ms inference, 1.6ms postprocess per i

{0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet', 62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone', 68: 'microw