# Loading the Model

- Make sure to install the following packages first:
    - ```pip install ultralytics```
    - ```pip install opencv-python```
    - ```pip install mediapipe```

In [1]:
from ultralytics import YOLO

# Load the detection model from the 'yolov8s.pt' weights file. The resulting
# `model` object is called directly on images/frames in the cells below.
# NOTE(review): Ultralytics presumably downloads the weights on first use when
# the file is not present locally — confirm.
model = YOLO('yolov8s.pt')

# Object Recognition in a single image

In [None]:
import cv2
from ultralytics import YOLO

# Load model (reloaded here so this cell can be run on its own)
model = YOLO('yolov8s.pt')

# Load the image. cv2.imread does not raise on a missing or unreadable file,
# it returns None, so fail loudly here instead of handing None to the model.
image_path = 'IMG_0954.jpg'
img = cv2.imread(image_path)
if img is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

# Run detection; the model returns one result per input image, so take the first
results = model(img)[0]

# Save the annotated image. The two options below are alternatives: both write
# to 'output.jpg', so when both run the second overwrites the first.
# Option 1: Using YOLO's save method
results.save(filename='output.jpg')

# Option 2: Using OpenCV with plotted results
plotted_img = results.plot()
cv2.imwrite('output.jpg', plotted_img)

# Clean exit
cv2.destroyAllWindows()

### What if you want a different bounding box?

#### Syntax for OpenCV: 
##### Rectangle syntax
- cv2.rectangle(image, start_point, end_point, color, thickness)
- cv2.rectangle(img, (x1, y1), (x2, y2), (B, G, R), 2)

##### Text syntax
- cv2.putText(image, text, org, font, fontScale, color, thickness)
- cv2.putText(img, "Hello", (x, y), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (B, G, R), 2)

##### Common parameters:
- Colors: (B,G,R)
  - Blue: (255, 0, 0)
  - Green: (0, 255, 0)
  - Red: (0, 0, 255)
  - Black: (0, 0, 0)
  - White: (255, 255, 255)

- Fonts:
  - cv2.FONT_HERSHEY_SIMPLEX 
  - cv2.FONT_HERSHEY_PLAIN
  - cv2.FONT_HERSHEY_DUPLEX
  - cv2.FONT_HERSHEY_COMPLEX

In [None]:
import cv2
import numpy as np

#loading the image
# cv2.imread returns None (no exception) when the file cannot be read, so
# check explicitly before running inference on it.
image_path = 'IMG_0954.jpg'
img = cv2.imread(image_path)
if img is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

# `model` is the YOLO model loaded in an earlier cell
results = model(img)[0]

# Each detection row is [x1, y1, x2, y2, score, class_id]
for x1, y1, x2, y2, score, class_id in results.boxes.data.tolist():
    class_name = results.names[int(class_id)]

    print(f"Detected {class_name} with confidence {score:.2f} at [{x1}, {y1}, {x2}, {y2}]")

    # OpenCV drawing functions need integer pixel coordinates
    top_left = (int(x1), int(y1))
    bottom_right = (int(x2), int(y2))

    #drawing the bounding box (black) and the label (blue, BGR order)
    cv2.rectangle(img, top_left, bottom_right, (0, 0, 0), 2)
    cv2.putText(img, f"{class_name} {score:.2f}", top_left, cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 0, 0), 6)

# Show the annotated image and block until any key is pressed
cv2.imshow('image', img)
cv2.waitKey(0)

cv2.imwrite('output.jpg', img)

# Close the preview window. No capture device is opened in this cell, so
# there is nothing to release, and the interpreter is left running.
cv2.destroyAllWindows()

## What if you want video capture?


In [None]:
import cv2
import numpy as np
from ultralytics import YOLO

#loading the model (done before opening the camera so a failed load cannot
#leave the device held open)
model = YOLO('yolov8s.pt')

#using webcam (VIDEO CAPTURE)
cap = cv2.VideoCapture(0)

#(OPTIONAL) setting the resolution; the named constants are the property
#ids 3 and 4
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

try:
    while True:
        ret, frame = cap.read()

        # ret is False when no frame could be grabbed
        if not ret:
            break

        results = model(frame)[0]

        # Each detection row is [x1, y1, x2, y2, score, class_id]
        for x1, y1, x2, y2, score, class_id in results.boxes.data.tolist():
            class_name = results.names[int(class_id)]

            print(f"Detected {class_name} with confidence {score:.2f} at [{x1}, {y1}, {x2}, {y2}]")

            # OpenCV drawing functions need integer pixel coordinates
            top_left = (int(x1), int(y1))
            bottom_right = (int(x2), int(y2))

            #drawing the bounding box
            cv2.rectangle(frame, top_left, bottom_right, (0, 0, 0), 2)
            cv2.putText(frame, f"{class_name} {score:.2f}", top_left, cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 0, 0), 6)

        cv2.imshow('image', frame)

        #stop everything on q press
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Runs on 'q', on a failed read and on any exception, so the camera and
    # the window are always released without terminating the interpreter.
    cap.release()
    cv2.destroyAllWindows()

## Did you know you can track hands?

In [None]:
import mediapipe as mp
import cv2
from ultralytics import YOLO

#loading the model
# NOTE(review): `model` is not used by the hand-tracking loop below; it is
# kept so the name stays defined exactly as before.
model = YOLO('yolov8s.pt')

# Initialize Mediapipe Hand Detection
mp_hands = mp.solutions.hands
mp_draw = mp.solutions.drawing_utils

# Webcam input
cap = cv2.VideoCapture(0)

try:
    # The context manager closes the Hands instance when the block exits
    with mp_hands.Hands(min_detection_confidence=0.7) as hands:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV delivers BGR frames; the hand tracker is fed RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(frame_rgb)

            # multi_hand_landmarks is falsy when no hand was found
            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    mp_draw.draw_landmarks(frame, hand_landmarks,
                                           mp_hands.HAND_CONNECTIONS)

            cv2.imshow("Hand Skeleton Tracking", frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Always give the camera back and close the window, even on an error
    cap.release()
    cv2.destroyAllWindows()

## What about gestures, then?

In [None]:
#!wget -q https://storage.googleapis.com/mediapipe-models/gesture_recognizer/gesture_recognizer/float16/1/gesture_recognizer.task

!curl -O https://storage.googleapis.com/mediapipe-models/gesture_recognizer/gesture_recognizer/float16/1/gesture_recognizer.task


In [None]:
import mediapipe as mp
import cv2
import time

# MediaPipe setup
# Hand-landmark solution and the drawing helpers used to render the skeleton;
# `hands` is the tracker instance the main loop calls on every frame.
mp_hands = mp.solutions.hands
mp_draw = mp.solutions.drawing_utils
hands = mp_hands.Hands(min_detection_confidence=0.7)

# Gesture recognizer setup
# model_path names the bundle saved by the download cell; the remaining
# assignments are short aliases for the MediaPipe Tasks classes used below.
model_path = 'gesture_recognizer.task'
BaseOptions = mp.tasks.BaseOptions
GestureRecognizer = mp.tasks.vision.GestureRecognizer
GestureRecognizerOptions = mp.tasks.vision.GestureRecognizerOptions
VisionRunningMode = mp.tasks.vision.RunningMode

# Global variable for gesture results
# Holds the latest (handedness, gesture, score) tuples: rebound by
# print_result and read by the display loop when drawing labels.
gesture_texts = []

def print_result(result, output_image, timestamp_ms):
    """Result callback for the gesture recognizer: publish the latest labels.

    Stores one ``(handedness, gesture, score)`` tuple per detected hand in the
    module-level ``gesture_texts`` list, taking the first category of each
    hand's handedness and gesture entries. The list becomes empty when the
    result carries no gestures or no handedness.

    The new list is built locally and bound to the global in one assignment,
    so code that reads ``gesture_texts`` while this callback is running never
    observes a partially filled list.

    Args:
        result: Recognizer result exposing ``gestures`` and ``handedness``.
        output_image: Unused; present because the callback signature needs it.
        timestamp_ms: Unused; present because the callback signature needs it.
    """
    global gesture_texts
    latest = []
    if result.gestures and result.handedness:
        latest = [
            (hand[0].category_name, gesture[0].category_name, gesture[0].score)
            for hand, gesture in zip(result.handedness, result.gestures)
        ]
    gesture_texts = latest

# LIVE_STREAM mode is asynchronous: frames are submitted with
# recognize_async and results come back through the print_result callback.
options = GestureRecognizerOptions(
    base_options=BaseOptions(model_asset_path=model_path),
    running_mode=VisionRunningMode.LIVE_STREAM,
    result_callback=print_result
)

# Bound before the try block so the cleanup can tell whether the camera was
# ever opened (creating the recognizer may fail first).
cap = None
try:
    with GestureRecognizer.create_from_options(options) as recognizer:
        cap = cv2.VideoCapture(0)
        timestamp = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV delivers BGR frames; MediaPipe is fed RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Process hands: draw the skeleton of every detected hand
            hand_results = hands.process(frame_rgb)
            if hand_results.multi_hand_landmarks:
                for landmarks in hand_results.multi_hand_landmarks:
                    mp_draw.draw_landmarks(
                        frame, landmarks, mp_hands.HAND_CONNECTIONS,
                        mp_draw.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=4),
                        mp_draw.DrawingSpec(color=(250, 44, 250), thickness=2)
                    )

            # Process gestures. recognize_async takes the frame time in
            # milliseconds and needs it to keep increasing, so use a monotonic
            # clock and force it strictly past the previous value.
            mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame_rgb)
            timestamp = max(timestamp + 1, int(time.monotonic() * 1000))
            recognizer.recognize_async(mp_image, timestamp)

            # Draw gesture labels, one text row per entry published by the callback
            for idx, (_hand, gesture, score) in enumerate(gesture_texts):
                cv2.putText(frame, f"{gesture} ({score:.2f})",
                            (10, 30 + idx * 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 1,
                            (0, 255, 0), 2, cv2.LINE_AA)

            cv2.imshow('Hand Gesture Recognition', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

finally:
    # Release everything on 'q', on a failed read and on any exception
    if cap is not None:
        cap.release()
    cv2.destroyAllWindows()
    hands.close()

In [None]:
import mediapipe as mp
import cv2
import time
import math
import os

# MediaPipe setup
# Hand-landmark solution and the drawing helpers used to render the skeleton;
# `hands` is the tracker instance the main loop calls on every frame.
mp_hands = mp.solutions.hands
mp_draw = mp.solutions.drawing_utils
hands = mp_hands.Hands(min_detection_confidence=0.7)

# Gesture recognizer setup
# model_path names the bundle saved by the download cell; the remaining
# assignments are short aliases for the MediaPipe Tasks classes used below.
model_path = 'gesture_recognizer.task'
BaseOptions = mp.tasks.BaseOptions
GestureRecognizer = mp.tasks.vision.GestureRecognizer
GestureRecognizerOptions = mp.tasks.vision.GestureRecognizerOptions
VisionRunningMode = mp.tasks.vision.RunningMode

# Global variables
# Latest (handedness, gesture, score) tuples; rebound by print_result and read
# by the display loop when drawing labels.
gesture_texts = []
# True while volume control is enabled; flipped by the main loop when a
# thumb/middle-finger pinch is detected.
volume_mode_active = False
# Debounce flag: cleared when a pinch toggles the mode and set again once the
# fingers move apart, so a single pinch toggles only once.
volume_toggle_ready = True

def print_result(result, output_image, timestamp_ms):
    """Result callback for the gesture recognizer: publish the latest labels.

    Stores one ``(handedness, gesture, score)`` tuple per detected hand in the
    module-level ``gesture_texts`` list, taking the first category of each
    hand's handedness and gesture entries. The list becomes empty when the
    result carries no gestures or no handedness.

    The new list is built locally and bound to the global in one assignment,
    so code that reads ``gesture_texts`` while this callback is running never
    observes a partially filled list.

    Args:
        result: Recognizer result exposing ``gestures`` and ``handedness``.
        output_image: Unused; present because the callback signature needs it.
        timestamp_ms: Unused; present because the callback signature needs it.
    """
    global gesture_texts
    latest = []
    if result.gestures and result.handedness:
        latest = [
            (hand[0].category_name, gesture[0].category_name, gesture[0].score)
            for hand, gesture in zip(result.handedness, result.gestures)
        ]
    gesture_texts = latest

def calculate_distance(landmark1, landmark2):
    """Return the straight-line 3D distance between two landmarks.

    Args:
        landmark1: Object with numeric ``x``, ``y`` and ``z`` attributes.
        landmark2: Object with numeric ``x``, ``y`` and ``z`` attributes.

    Returns:
        The Euclidean distance between the two points as a float.
    """
    dx = landmark1.x - landmark2.x
    dy = landmark1.y - landmark2.y
    dz = landmark1.z - landmark2.z
    # Squares are summed in x, y, z order before taking the root
    return math.sqrt(sum(delta ** 2 for delta in (dx, dy, dz)))

def adjust_volume(change):
    """Step the system output volume up or down by 5 via ``osascript``.

    Args:
        change: A positive value raises the volume, a negative value lowers
            it, and anything else (such as 0) leaves it untouched.

    NOTE(review): ``osascript`` is presumably only available on macOS; on
    other systems the shell command just fails — confirm the target platform.
    """
    if change > 0:
        operator = "+"
    elif change < 0:
        operator = "-"
    else:
        # Neither positive nor negative: nothing to do
        return
    os.system(
        "osascript -e 'set volume output volume "
        f"(output volume of (get volume settings) {operator} 5)'"
    )

# Baseline thumb-index distance for volume control. None means "not captured
# yet"; it is captured on the first frame processed while volume mode is on.
default_distance = None

# LIVE_STREAM mode is asynchronous: frames are submitted with
# recognize_async and results come back through the print_result callback.
options = GestureRecognizerOptions(
    base_options=BaseOptions(model_asset_path=model_path),
    running_mode=VisionRunningMode.LIVE_STREAM,
    result_callback=print_result
)

# Bound before the try block so the cleanup can tell whether the camera was
# ever opened (creating the recognizer may fail first).
cap = None
try:
    with GestureRecognizer.create_from_options(options) as recognizer:
        cap = cv2.VideoCapture(0)
        timestamp = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV delivers BGR frames; MediaPipe is fed RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Process hands
            hand_results = hands.process(frame_rgb)
            if hand_results.multi_hand_landmarks:
                for landmarks in hand_results.multi_hand_landmarks:
                    mp_draw.draw_landmarks(
                        frame, landmarks, mp_hands.HAND_CONNECTIONS,
                        mp_draw.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=4),
                        mp_draw.DrawingSpec(color=(250, 44, 250), thickness=2)
                    )

                    # Detect middle finger and thumb pinch gesture to toggle volume mode
                    thumb_tip = landmarks.landmark[4]
                    middle_tip = landmarks.landmark[12]
                    pinch_distance = calculate_distance(thumb_tip, middle_tip)

                    if pinch_distance < 0.05 and volume_toggle_ready:  # Threshold for pinch gesture
                        volume_mode_active = not volume_mode_active
                        volume_toggle_ready = False  # Prevent multiple toggles in one pinch
                        # Forget the old baseline so each activation measures a
                        # fresh one instead of reusing a stale distance.
                        default_distance = None
                        mode_status = "ON" if volume_mode_active else "OFF"
                        cv2.putText(frame, f"Volume Mode {mode_status}",
                                    (50, 100),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1,
                                    (255, 255, 0), 2, cv2.LINE_AA)
                        print(f"Volume Mode {mode_status}")

                    if pinch_distance > 0.1:  # Reset toggle readiness when fingers move apart
                        volume_toggle_ready = True

                    # If volume mode is active, process thumb and index finger gestures for volume control
                    if volume_mode_active:
                        index_tip = landmarks.landmark[8]
                        distance = calculate_distance(thumb_tip, index_tip)

                        if default_distance is None:
                            default_distance = distance  # Set the initial distance as default
                            print(f"Default distance set: {default_distance}")
                        elif distance < default_distance * 0.8:  # Threshold for volume up
                            cv2.putText(frame, "Volume Up!",
                                        (50, 50),
                                        cv2.FONT_HERSHEY_SIMPLEX, 1,
                                        (0, 255, 0), 2, cv2.LINE_AA)
                            adjust_volume(1)
                            print("Volume Up Gesture Detected")
                        elif distance > default_distance * 1.2:  # Threshold for volume down
                            cv2.putText(frame, "Volume Down!",
                                        (50, 50),
                                        cv2.FONT_HERSHEY_SIMPLEX, 1,
                                        (0, 0, 255), 2, cv2.LINE_AA)
                            adjust_volume(-1)
                            print("Volume Down Gesture Detected")

            # Process gestures. recognize_async takes the frame time in
            # milliseconds and needs it to keep increasing, so use a monotonic
            # clock and force it strictly past the previous value.
            mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame_rgb)
            timestamp = max(timestamp + 1, int(time.monotonic() * 1000))
            recognizer.recognize_async(mp_image, timestamp)

            # Draw gesture labels, one text row per entry published by the callback
            for idx, (_hand, gesture, score) in enumerate(gesture_texts):
                cv2.putText(frame, f"{gesture} ({score:.2f})",
                            (10, 30 + idx * 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 1,
                            (0, 255, 0), 2, cv2.LINE_AA)

            cv2.imshow('Hand Gesture Recognition', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

finally:
    # Release everything on 'q', on a failed read and on any exception
    if cap is not None:
        cap.release()
    cv2.destroyAllWindows()
    hands.close()
