In [None]:
# Pull out a thumbnail image of each item that leaves the scene. Occlusion handling is currently broken, so clipped videos are preferred.

from ultralytics import YOLO
import cv2
import os
import time

def iou(boxA, boxB):
    """Return the intersection-over-union of two (x1, y1, x2, y2) boxes.

    Yields 0.0 when either box has zero area; otherwise the intersection
    area divided by the union area.
    """
    area_a = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    area_b = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    if 0 in (area_a, area_b):
        return 0.0

    # Extent of the overlap along each axis, clamped at zero for disjoint boxes
    overlap_w = max(0, min(boxA[2], boxB[2]) - max(boxA[0], boxB[0]))
    overlap_h = max(0, min(boxA[3], boxB[3]) - max(boxA[1], boxB[1]))
    overlap = overlap_w * overlap_h

    union = area_a + area_b - overlap
    return overlap / float(union)

MAX_THUMBNAIL_WIDTH = 100  # Width in pixels that every thumbnail is scaled to

def overlay_object_images(frame, objects_that_left, object_images):
    """Draw thumbnails of departed objects in a column at the right edge of ``frame``.

    Each thumbnail is scaled to ``MAX_THUMBNAIL_WIDTH`` pixels wide (aspect
    ratio preserved) and stacked upwards from the bottom-right corner, with
    an "Object <label>" caption just above it. ``frame`` is modified in place.

    Args:
        frame: Image array indexed as [row, column]; receives the thumbnails.
        objects_that_left: Iterable of labels to display.
        object_images: Mapping of label -> thumbnail image. Labels with no
            entry, a ``None`` entry or an empty image are skipped.

    Returns:
        The same ``frame`` object that was passed in.
    """
    height, width = frame.shape[:2]
    img_offset = 10  # Gap between thumbnails and from the frame border

    # A set has no guaranteed iteration order; sorting keeps every thumbnail
    # in the same slot from one frame to the next.
    try:
        labels = sorted(objects_that_left)
    except TypeError:
        # Labels that cannot be compared with each other: keep the given order
        labels = list(objects_that_left)

    y_start = height  # Top edge of the lowest free slot; moves upwards

    for lbl in labels:
        if lbl not in object_images:
            continue
        try:
            obj_img = object_images[lbl]
            if obj_img is None or obj_img.size == 0:
                continue

            # Scale to the fixed thumbnail width, preserving the aspect ratio.
            # Clamp to 1 px so an extremely wide crop cannot produce a zero
            # dimension, which cv2.resize rejects.
            orig_h, orig_w = obj_img.shape[:2]
            scale = MAX_THUMBNAIL_WIDTH / float(orig_w)
            new_w = max(1, int(orig_w * scale))
            new_h = max(1, int(orig_h * scale))

            top = y_start - (new_h + img_offset)
            left = width - (new_w + img_offset)

            # Bounds check. The slot is only consumed once a thumbnail is
            # actually drawn, so a shorter thumbnail later on can still fit.
            if left < 0 or top < 0:
                continue

            # Overlay the object image
            frame[top:top + new_h, left:left + new_w] = cv2.resize(obj_img, (new_w, new_h))

            # Add label text above the thumbnail
            cv2.putText(frame, f"Object {lbl}",
                        (left, top - 5),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5,
                        (255, 255, 255),
                        1)

            y_start = top

        except Exception as e:
            # Best effort: one bad thumbnail must not abort the whole frame
            print(f"Error processing object {lbl}: {str(e)}")
            continue

    return frame

# ------------------------------------------------------------
# Configuration
# ------------------------------------------------------------
model_path = r"C:\Users\joshu\Documents\EEE\AMNIS\Models\training\weights\best.pt"
video_path = r"C:\Users\joshu\Downloads\start_end_cornflakes.mp4"
save_dir = r"C:\Users\joshu\Documents\EEE\AMNIS\output_videos"
os.makedirs(save_dir, exist_ok=True)

# Detection thresholds
detection_conf_thresh = 0.01   # Confidence threshold for YOLO detections
detection_nms_thresh = 0.05   # NMS IOU threshold for YOLO detections

# Tracking thresholds
tracking_iou_thresh = 0.5     # IOU threshold for object tracking/matching
shelf_thresh = 100           # Vertical distance threshold for shelf grouping
lost_time_limit = 500        # Frames before declaring object as permanently lost

tracker_config = 'bytetrack.yaml'


def _crop_box(image, box):
    """Return a copy of the part of ``image`` inside ``box`` (x1, y1, x2, y2), or None if it is empty."""
    img_h, img_w = image.shape[:2]
    x1, y1, x2, y2 = box
    # Clamp to the image so a box touching the border is still usable
    x1, x2 = max(0, x1), min(img_w, x2)
    y1, y2 = max(0, y1), min(img_h, y2)
    if x2 <= x1 or y2 <= y1:
        return None
    return image[y1:y2, x1:x2].copy()


# Read the first frame (for ROI selection) and the stream properties. The
# capture handle is released even when the first read fails.
cap = cv2.VideoCapture(video_path)
try:
    ret, first_frame = cap.read()
    if not ret:
        raise RuntimeError("Could not read first frame")
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
finally:
    cap.release()

if not fps > 0:
    # The container reported no frame rate; fall back to a common default
    # so the writer below still gets a usable value.
    fps = 30.0

# Get ROI from user
print("Select ROI and press ENTER. Press C to cancel.")
roi = cv2.selectROI("Select ROI", first_frame, fromCenter=False, showCrosshair=True)
cv2.destroyWindow("Select ROI")
x_min, y_min, w, h = (int(v) for v in roi)
if w <= 0 or h <= 0:
    # selectROI returns an all-zero rectangle when the selection is
    # cancelled; an empty ROI would reject every detection.
    print("No ROI selected; using the full frame.")
    x_min, y_min, w, h = 0, 0, width, height
x_max, y_max = x_min + w, y_min + h

# Load YOLO model
model = YOLO(model_path)

# Initialize the tracking generator
generator = model.track(
    source=video_path,
    imgsz=640,
    conf=detection_conf_thresh,
    iou=detection_nms_thresh,
    tracker=tracker_config,
    stream=True
)

# Prepare video writer
output_video_path = os.path.join(save_dir, "cornflakes.mp4")
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out_writer = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

# State management
active_objects = []      # List of (label, box) tuples for current objects
lost_objects = {}        # Dict of label: (box, last_seen_frame); pruned after lost_time_limit frames
next_label_num = 1      # Counter for generating unique object labels
objects_that_left = set()  # Set of labels for objects that have left the scene
frame_count = 0
box_color = (0, 255, 0)
object_first_seen = {}  # label -> (frame number, box) of the first sighting
object_images = {}      # label -> thumbnail shown for a departed object
last_valid_images = {}  # label -> most recent crop taken while the object was visible

# Main processing loop
try:
    for results in generator:
        frame_count += 1
        frame = results.orig_img.copy()

        # Filter boxes to only include those entirely within the ROI
        boxes = []
        for b in results.boxes:
            x1, y1, x2, y2 = map(int, b.xyxy[0])
            if (x_min <= x1 <= x_max and x_min <= x2 <= x_max and
                    y_min <= y1 <= y_max and y_min <= y2 <= y_max):
                boxes.append((x1, y1, x2, y2))

        # Group boxes into shelves: a box joins the first shelf whose mean
        # bottom edge is within shelf_thresh pixels of its own bottom edge.
        boxes.sort(key=lambda b: b[3])
        shelves = []
        for bx in boxes:
            for shelf in shelves:
                shelf_mean_bottom = sum(b[3] for b in shelf) / len(shelf)
                if abs(bx[3] - shelf_mean_bottom) <= shelf_thresh:
                    shelf.append(bx)
                    break
            else:
                shelves.append([bx])

        # Sort each shelf left-to-right and create the final box list
        for shelf in shelves:
            shelf.sort(key=lambda b: b[0])
        sorted_boxes = [b for shelf in shelves for b in shelf]

        # Track objects: greedy best-IoU matching against the previous frame
        previous_boxes = dict(active_objects)
        new_objects = []
        used_active_labels = set()

        for new_box in sorted_boxes:
            best_iou = 0
            best_label = None
            for (lbl, a_box) in active_objects:
                if lbl in used_active_labels:
                    # Already claimed by another detection in this frame;
                    # a label must not be assigned to two boxes at once.
                    continue
                iou_val = iou(a_box, new_box)
                if iou_val > best_iou:
                    best_iou = iou_val
                    best_label = lbl

            if best_iou > tracking_iou_thresh:
                label = best_label
            else:
                # Assign a new label
                label = next_label_num
                object_first_seen[label] = (frame_count, new_box)
                next_label_num += 1

            new_objects.append((label, new_box))
            used_active_labels.add(label)

            # Store the current image of the object while it is still
            # visible (taken before anything is drawn on the frame).
            crop = _crop_box(frame, new_box)
            if crop is not None:
                last_valid_images[label] = crop

        # Handle disappeared objects. This also runs for frames without any
        # detection, so those objects get a thumbnail as well.
        for lbl in sorted(set(previous_boxes) - used_active_labels):
            lost_objects[lbl] = (previous_boxes[lbl], frame_count)
            objects_that_left.add(lbl)

            # Use the last valid image we stored
            if lbl in last_valid_images:
                object_images[lbl] = last_valid_images[lbl]
                # Optionally save to disk
                image_path = os.path.join(save_dir, f"object_{lbl}.png")
                if cv2.imwrite(image_path, last_valid_images[lbl]):
                    print(f"Saved image for object {lbl} at {image_path}")
                else:
                    print(f"Could not save image for object {lbl} at {image_path}")

        # Clean up lost objects
        lost_objects = {
            lbl: (box, frame_no)
            for lbl, (box, frame_no) in lost_objects.items()
            if frame_count - frame_no <= lost_time_limit
        }

        active_objects = new_objects

        # Draw visualizations
        # Draw ROI
        cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (255, 0, 0), 2)

        # Draw bounding boxes and labels
        for (lbl, (x1, y1, x2, y2)) in active_objects:
            cv2.rectangle(frame, (x1, y1), (x2, y2), box_color, 2)
            cv2.putText(frame, f"{lbl}", (x1 + 5, y1 + 20),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, box_color, 2)

        # Draw shelf boundaries
        for idx, shelf in enumerate(shelves, start=1):
            y = min(b[3] for b in shelf)
            cv2.line(frame, (0, y), (width, y), (0, 0, 255), 2)
            cv2.putText(frame, f"Shelf {idx}", (10, y - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

        # Use the overlay_object_images function
        frame = overlay_object_images(frame, objects_that_left, object_images)

        # Draw status information
        if objects_that_left:
            text = f"Objects that have left: {', '.join(map(str, sorted(objects_that_left)))}"
            cv2.putText(frame, text, (100, 90),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)

        cv2.putText(frame, f"Total objects in scene: {len(active_objects)}",
                    (100, 45), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        out_writer.write(frame)
finally:
    # Finalize the output file even if processing stops with an error
    out_writer.release()

print(f"Processed video saved to {output_video_path}")

def play_video(video_path):
    """Play a video file in an OpenCV window until it ends or the user quits.

    Playback stops at the end of the file, when 'q' is pressed, or when the
    window is closed. The capture handle and all OpenCV windows are released
    on exit, including when an error interrupts playback.

    Args:
        video_path: Path of the video file to play.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        # Milliseconds to wait between frames. A reported frame rate of 0
        # would divide by zero, and a delay of 0 makes waitKey block until a
        # key press, so fall back to ~30 FPS and never go below 1 ms.
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_delay = max(1, int(1000 / fps)) if fps > 0 else 33

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            cv2.imshow('Processed Video', frame)

            # Break loop if 'q' is pressed or window is closed
            key = cv2.waitKey(frame_delay) & 0xFF
            if key == ord('q') or cv2.getWindowProperty('Processed Video', cv2.WND_PROP_VISIBLE) < 1:
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()

# Play the processed video
print("Playing processed video (press 'q' to quit)...")
play_video(output_video_path)

Select ROI and press ENTER. Press C to cancel.

video 1/1 (frame 1/87) C:\Users\joshu\Downloads\start_end_cornflakes.mp4: 384x640 30 obs, 25.2ms
video 1/1 (frame 2/87) C:\Users\joshu\Downloads\start_end_cornflakes.mp4: 384x640 30 obs, 27.0ms
video 1/1 (frame 3/87) C:\Users\joshu\Downloads\start_end_cornflakes.mp4: 384x640 30 obs, 23.0ms
video 1/1 (frame 4/87) C:\Users\joshu\Downloads\start_end_cornflakes.mp4: 384x640 30 obs, 19.4ms
video 1/1 (frame 5/87) C:\Users\joshu\Downloads\start_end_cornflakes.mp4: 384x640 30 obs, 22.0ms
video 1/1 (frame 6/87) C:\Users\joshu\Downloads\start_end_cornflakes.mp4: 384x640 30 obs, 22.0ms
video 1/1 (frame 7/87) C:\Users\joshu\Downloads\start_end_cornflakes.mp4: 384x640 30 obs, 25.0ms
video 1/1 (frame 8/87) C:\Users\joshu\Downloads\start_end_cornflakes.mp4: 384x640 30 obs, 22.0ms
video 1/1 (frame 9/87) C:\Users\joshu\Downloads\start_end_cornflakes.mp4: 384x640 30 obs, 20.0ms
video 1/1 (frame 10/87) C:\Users\joshu\Downloads\start_end_cornflakes.mp4: 384x

In [None]:
"""
Object Tracking System using YOLO and OpenCV
------------------------------------------
This script implements an object tracking system that:
1. Detects objects using YOLO
2. Tracks objects across frames
3. Captures thumbnails of objects that leave the scene
4. Displays tracking information and object history

Key Features:
- ROI-based tracking
- Shelf detection and visualization
- Object persistence tracking
- Thumbnail capture and display
- Real-time visualization
"""

from ultralytics import YOLO
import cv2
import os
import time

def iou(boxA, boxB):
    """
    Compute the Intersection over Union (IoU) of two bounding boxes.

    Args:
        boxA, boxB: Boxes indexed as (x1, y1, x2, y2)

    Returns:
        float: Intersection area divided by union area, or 0.0 when
        either box has zero area
    """
    def _area(box):
        return (box[2] - box[0]) * (box[3] - box[1])

    area_a, area_b = _area(boxA), _area(boxB)
    if area_a == 0 or area_b == 0:
        return 0.0

    # Corners of the overlapping rectangle (may be inverted if disjoint)
    left = max(boxA[0], boxB[0])
    top = max(boxA[1], boxB[1])
    right = min(boxA[2], boxB[2])
    bottom = min(boxA[3], boxB[3])

    shared = max(0, right - left) * max(0, bottom - top)
    return shared / float(area_a + area_b - shared)

MAX_THUMBNAIL_WIDTH = 100  # Width in pixels that every thumbnail is scaled to

def overlay_object_images(frame, objects_that_left, object_images):
    """
    Overlay thumbnails of tracked objects that have left the scene.

    Thumbnails are scaled to MAX_THUMBNAIL_WIDTH pixels wide (aspect ratio
    preserved) and stacked upwards from the bottom-right corner of the
    frame, each with an "Object <label>" caption just above it. The frame
    is modified in place.

    Args:
        frame: Current video frame (image array indexed as [row, column])
        objects_that_left: Iterable of object labels that have left the scene
        object_images: Dictionary of object labels to their thumbnail images;
            labels with no entry, a None entry or an empty image are skipped

    Returns:
        frame: The same frame object, with the thumbnails drawn on it
    """
    height, width = frame.shape[:2]
    img_offset = 10  # Gap between thumbnails and from the frame border

    # A set has no guaranteed iteration order; sorting keeps every thumbnail
    # in the same slot from one frame to the next.
    try:
        labels = sorted(objects_that_left)
    except TypeError:
        # Labels that cannot be compared with each other: keep the given order
        labels = list(objects_that_left)

    y_start = height  # Top edge of the lowest free slot; moves upwards

    for lbl in labels:
        if lbl not in object_images:
            continue
        try:
            obj_img = object_images[lbl]
            if obj_img is None or obj_img.size == 0:
                continue

            # Calculate thumbnail size maintaining aspect ratio. Clamp to
            # 1 px so an extremely wide crop cannot produce a zero
            # dimension, which cv2.resize rejects.
            orig_h, orig_w = obj_img.shape[:2]
            scale = MAX_THUMBNAIL_WIDTH / float(orig_w)
            new_w = max(1, int(orig_w * scale))
            new_h = max(1, int(orig_h * scale))

            # Position thumbnail above previous one
            top = y_start - (new_h + img_offset)
            left = width - (new_w + img_offset)

            # Skip if thumbnail would be outside frame. The slot is only
            # consumed once a thumbnail is actually drawn, so a shorter
            # thumbnail later on can still fit.
            if left < 0 or top < 0:
                continue

            # Overlay thumbnail and label
            frame[top:top + new_h, left:left + new_w] = cv2.resize(obj_img, (new_w, new_h))
            cv2.putText(frame, f"Object {lbl}",
                        (left, top - 5),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5,
                        (255, 255, 255),
                        1)

            y_start = top

        except Exception as e:
            # Best effort: one bad thumbnail must not abort the whole frame
            print(f"Error processing object {lbl}: {str(e)}")
            continue

    return frame

# ------------------------------------------------------------
# Configuration Parameters
# ------------------------------------------------------------
model_path = r"C:\Users\joshu\Documents\EEE\AMNIS\Models\training\weights\best.pt"
video_path = r"C:\Users\joshu\Downloads\start_end_cornflakes.mp4"
save_dir = r"C:\Users\joshu\Documents\EEE\AMNIS\output_videos"
os.makedirs(save_dir, exist_ok=True)

# Detection and tracking thresholds
detection_conf_thresh = 0.01   # YOLO detection confidence threshold
detection_nms_thresh = 0.05    # Non-maximum suppression IOU threshold
tracking_iou_thresh = 0.5      # IOU threshold for frame-to-frame tracking
shelf_thresh = 100            # Vertical pixel threshold for shelf grouping
lost_time_limit = 500         # Frames before considering object permanently lost

tracker_config = 'bytetrack.yaml'


def _crop_box(image, box):
    """
    Copy the part of an image covered by a bounding box.

    Args:
        image: Image array indexed as [row, column]
        box: Tuple of (x1, y1, x2, y2) coordinates

    Returns:
        A copy of the covered region, or None if the region is empty
    """
    img_h, img_w = image.shape[:2]
    x1, y1, x2, y2 = box
    # Clamp to the image so a box touching the border is still usable
    x1, x2 = max(0, x1), min(img_w, x2)
    y1, y2 = max(0, y1), min(img_h, y2)
    if x2 <= x1 or y2 <= y1:
        return None
    return image[y1:y2, x1:x2].copy()


# Read the first frame (for ROI selection) and the video properties for
# output. The capture handle is released even when the first read fails.
cap = cv2.VideoCapture(video_path)
try:
    ret, first_frame = cap.read()
    if not ret:
        raise RuntimeError("Could not read first frame")
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
finally:
    cap.release()

if not fps > 0:
    # The container reported no frame rate; fall back to a common default
    # so the writer below still gets a usable value.
    fps = 30.0

# Get ROI from user interaction
print("Select ROI and press ENTER. Press C to cancel.")
roi = cv2.selectROI("Select ROI", first_frame, fromCenter=False, showCrosshair=True)
cv2.destroyWindow("Select ROI")
x_min, y_min, w, h = (int(v) for v in roi)
if w <= 0 or h <= 0:
    # selectROI returns an all-zero rectangle when the selection is
    # cancelled; an empty ROI would reject every detection.
    print("No ROI selected; using the full frame.")
    x_min, y_min, w, h = 0, 0, width, height
x_max, y_max = x_min + w, y_min + h

# Initialize YOLO model and tracking
model = YOLO(model_path)
generator = model.track(
    source=video_path,
    imgsz=640,
    conf=detection_conf_thresh,
    iou=detection_nms_thresh,
    tracker=tracker_config,
    stream=True
)

# Setup video writer
output_video_path = os.path.join(save_dir, "cornflakes.mp4")
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out_writer = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

# Initialize state tracking variables
active_objects = []           # Currently visible objects as (label, box) tuples
lost_objects = {}            # label -> (box, frame last seen); pruned after lost_time_limit frames
next_label_num = 1           # Counter for unique object labels
objects_that_left = set()    # Objects that have left the scene
frame_count = 0
box_color = (0, 255, 0)      # Color for bounding boxes
object_first_seen = {}       # label -> (frame number, box) of the first sighting
object_images = {}           # label -> thumbnail shown for a departed object
last_valid_images = {}       # label -> most recent crop taken while visible

# Main processing loop
try:
    for results in generator:
        frame_count += 1
        frame = results.orig_img.copy()

        # Filter detections to those entirely inside the ROI
        boxes = []
        for b in results.boxes:
            x1, y1, x2, y2 = map(int, b.xyxy[0])
            if (x_min <= x1 <= x_max and x_min <= x2 <= x_max and
                    y_min <= y1 <= y_max and y_min <= y2 <= y_max):
                boxes.append((x1, y1, x2, y2))

        # Group boxes into shelves based on vertical position: a box joins
        # the first shelf whose mean bottom edge is within shelf_thresh
        # pixels of its own bottom edge.
        boxes.sort(key=lambda b: b[3])  # Sort by bottom y-coordinate
        shelves = []
        for bx in boxes:
            for shelf in shelves:
                shelf_mean_bottom = sum(b[3] for b in shelf) / len(shelf)
                if abs(bx[3] - shelf_mean_bottom) <= shelf_thresh:
                    shelf.append(bx)
                    break
            else:
                shelves.append([bx])

        # Sort boxes within each shelf left-to-right
        for shelf in shelves:
            shelf.sort(key=lambda b: b[0])
        sorted_boxes = [b for shelf in shelves for b in shelf]

        # Track objects across frames (greedy best-IoU matching)
        previous_boxes = dict(active_objects)
        new_objects = []
        used_active_labels = set()

        for new_box in sorted_boxes:
            # Try to match with existing objects
            best_iou = 0
            best_label = None
            for (lbl, a_box) in active_objects:
                if lbl in used_active_labels:
                    # Already claimed by another detection in this frame;
                    # a label must not be assigned to two boxes at once.
                    continue
                iou_val = iou(a_box, new_box)
                if iou_val > best_iou:
                    best_iou = iou_val
                    best_label = lbl

            if best_iou > tracking_iou_thresh:
                # Matched existing object
                label = best_label
            else:
                # New object detected
                label = next_label_num
                object_first_seen[label] = (frame_count, new_box)
                next_label_num += 1

            new_objects.append((label, new_box))
            used_active_labels.add(label)

            # Update object image (taken before anything is drawn on the frame)
            crop = _crop_box(frame, new_box)
            if crop is not None:
                last_valid_images[label] = crop

        # Handle objects that disappeared. This also runs for frames with
        # no detections, so those objects get a thumbnail as well.
        for lbl in sorted(set(previous_boxes) - used_active_labels):
            lost_objects[lbl] = (previous_boxes[lbl], frame_count)
            objects_that_left.add(lbl)

            # Save thumbnail of disappeared object
            if lbl in last_valid_images:
                object_images[lbl] = last_valid_images[lbl]
                image_path = os.path.join(save_dir, f"object_{lbl}.png")
                if cv2.imwrite(image_path, last_valid_images[lbl]):
                    print(f"Saved image for object {lbl} at {image_path}")
                else:
                    print(f"Could not save image for object {lbl} at {image_path}")

        # Clean up old lost objects
        lost_objects = {
            lbl: (box, frame_no)
            for lbl, (box, frame_no) in lost_objects.items()
            if frame_count - frame_no <= lost_time_limit
        }

        active_objects = new_objects

        # Visualization
        # Draw ROI boundary
        cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (255, 0, 0), 2)

        # Draw object boxes and labels
        for (lbl, (x1, y1, x2, y2)) in active_objects:
            cv2.rectangle(frame, (x1, y1), (x2, y2), box_color, 2)
            cv2.putText(frame, f"{lbl}", (x1 + 5, y1 + 20),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, box_color, 2)

        # Draw shelf boundaries
        for idx, shelf in enumerate(shelves, start=1):
            y = min(b[3] for b in shelf)
            cv2.line(frame, (0, y), (width, y), (0, 0, 255), 2)
            cv2.putText(frame, f"Shelf {idx}", (10, y - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

        # Overlay thumbnails of departed objects
        frame = overlay_object_images(frame, objects_that_left, object_images)

        # Draw status information
        if objects_that_left:
            text = f"Objects that have left: {', '.join(map(str, sorted(objects_that_left)))}"
            cv2.putText(frame, text, (100, 90),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)

        cv2.putText(frame, f"Total objects in scene: {len(active_objects)}",
                    (100, 45), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        out_writer.write(frame)
finally:
    # Cleanup: finalize the output file even if processing stops with an error
    out_writer.release()

print(f"Processed video saved to {output_video_path}")

def play_video(video_path):
    """
    Play the processed video with basic playback controls.

    Playback stops at the end of the file, when 'q' is pressed, or when the
    window is closed. The capture handle and all OpenCV windows are released
    on exit, including when an error interrupts playback.

    Args:
        video_path: Path to the video file to play
    """
    cap = cv2.VideoCapture(video_path)
    try:
        # Milliseconds to wait between frames. A reported frame rate of 0
        # would divide by zero, and a delay of 0 makes waitKey block until a
        # key press, so fall back to ~30 FPS and never go below 1 ms.
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_delay = max(1, int(1000 / fps)) if fps > 0 else 33

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            cv2.imshow('Processed Video', frame)

            # Stop on 'q' or when the window has been closed
            key = cv2.waitKey(frame_delay) & 0xFF
            if key == ord('q') or cv2.getWindowProperty('Processed Video', cv2.WND_PROP_VISIBLE) < 1:
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()

# Play the processed video
print("Playing processed video (press 'q' to quit)...")
play_video(output_video_path)

Select ROI and press ENTER. Press C to cancel.

video 1/1 (frame 1/87) C:\Users\joshu\Downloads\start_end_cornflakes.mp4: 384x640 30 obs, 17.0ms
video 1/1 (frame 2/87) C:\Users\joshu\Downloads\start_end_cornflakes.mp4: 384x640 30 obs, 17.1ms
video 1/1 (frame 3/87) C:\Users\joshu\Downloads\start_end_cornflakes.mp4: 384x640 30 obs, 16.0ms
video 1/1 (frame 4/87) C:\Users\joshu\Downloads\start_end_cornflakes.mp4: 384x640 30 obs, 16.0ms
video 1/1 (frame 5/87) C:\Users\joshu\Downloads\start_end_cornflakes.mp4: 384x640 30 obs, 15.8ms
video 1/1 (frame 6/87) C:\Users\joshu\Downloads\start_end_cornflakes.mp4: 384x640 30 obs, 15.7ms
video 1/1 (frame 7/87) C:\Users\joshu\Downloads\start_end_cornflakes.mp4: 384x640 30 obs, 16.1ms
video 1/1 (frame 8/87) C:\Users\joshu\Downloads\start_end_cornflakes.mp4: 384x640 30 obs, 15.5ms
video 1/1 (frame 9/87) C:\Users\joshu\Downloads\start_end_cornflakes.mp4: 384x640 30 obs, 11.5ms
video 1/1 (frame 10/87) C:\Users\joshu\Downloads\start_end_cornflakes.mp4: 384x

In [10]:
import os
import cv2
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

import numpy as np
from PIL import Image

def run_ocr_on_image(image_path, output_debug=False, debug_folder=None, scale_factor=4):
    """
    Takes an image, preprocesses it, and runs OCR to extract text.

    The image is upscaled with cubic interpolation, converted to grayscale,
    binarised with Otsu's threshold and passed to Tesseract (PSM 6, OEM 3,
    English). Any failure is reported on stdout and yields an empty string.

    Args:
        image_path (str): Path to the image file.
        output_debug (bool): If True, saves intermediate debug images.
        debug_folder (str | None): Folder for the debug images. Defaults to
            the project's OCR_debug folder when None.
        scale_factor (int): Factor by which width and height are multiplied
            before OCR.

    Returns:
        str: The OCR text extracted from the image, or "" on failure.
    """
    if debug_folder is None:
        debug_folder = r"C:\Users\joshu\Documents\GitHub\object_detection\assets\OCR_debug"

    def _save_debug(stage, img):
        # Write one intermediate image as debug_<stage>.png when debugging is on
        if not output_debug:
            return
        debug_path = os.path.join(debug_folder, f"debug_{stage}.png")
        cv2.imwrite(debug_path, img)
        print(f"[DEBUG] Saved {stage} image to {debug_path}")

    try:
        # Create the debug folder if it doesn't exist
        if output_debug:
            os.makedirs(debug_folder, exist_ok=True)

        # Load image using OpenCV
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Could not read image from {image_path}")

        print(f"[INFO] Loaded image from {image_path}, size: {image.shape}")

        # Step 1: Upscale the image
        upscaled = cv2.resize(
            image,
            (image.shape[1] * scale_factor, image.shape[0] * scale_factor),
            interpolation=cv2.INTER_CUBIC
        )
        print(f"[INFO] Upscaled image size: {upscaled.shape}")
        _save_debug("upscaled", upscaled)

        # Step 2: Convert to grayscale
        gray = cv2.cvtColor(upscaled, cv2.COLOR_BGR2GRAY)
        print("[INFO] Converted image to grayscale.")
        _save_debug("grayscale", gray)

        # Step 3: Apply Otsu's thresholding
        _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
        print("[INFO] Applied Otsu's thresholding.")
        _save_debug("thresholded", thresh)

        # Step 4: Run Tesseract OCR
        config = r'--psm 6 --oem 3 -l eng'  # PSM 6: Assume a single block of text
        ocr_text = pytesseract.image_to_string(thresh, config=config)

        print("[INFO] OCR completed. Extracted text:")
        print(ocr_text)

        return ocr_text

    except Exception as e:
        # Best effort: report the problem and hand back an empty result
        print(f"[ERROR] Failed to process image: {str(e)}")
        return ""

if __name__ == "__main__":
    # Object photo to read
    image_path = r"C:\Users\joshu\Documents\EEE\AMNIS\output_videos\object_19.png"

    # OCR the photo, keeping the intermediate debug images
    extracted_text = run_ocr_on_image(image_path, output_debug=True)

    # Header line and OCR text, each on its own line
    print("\nFinal OCR Output:", extracted_text, sep="\n")


[INFO] Loaded image from C:\Users\joshu\Documents\EEE\AMNIS\output_videos\object_19.png, size: (115, 43, 3)
[INFO] Upscaled image size: (460, 172, 3)
[DEBUG] Saved upscaled image to C:\Users\joshu\Documents\GitHub\object_detection\assets\OCR_debug\debug_upscaled.png
[INFO] Converted image to grayscale.
[DEBUG] Saved grayscale image to C:\Users\joshu\Documents\GitHub\object_detection\assets\OCR_debug\debug_grayscale.png
[INFO] Applied Otsu's thresholding.
[DEBUG] Saved thresholded image to C:\Users\joshu\Documents\GitHub\object_detection\assets\OCR_debug\debug_thresholded.png
[INFO] OCR completed. Extracted text:
i
| Come
La a)


Final OCR Output:
i
| Come
La a)

