In [1]:
import os

# Current working directory of the kernel.
# NOTE(review): the name implies the notebook is run from the project's
# notebooks folder -- confirm, since every path below is derived from it.
NOTEBOOKS_DIR = os.getcwd()
# Parent of the working directory; the cells below use it as the root for
# model weights, example files and the output folders.
HOME_DIR = os.path.dirname(NOTEBOOKS_DIR)

def get_next_directory(base_path="output/predict"):
    """
    Return the first output path derived from `base_path` that does not exist.

    `base_path` itself is returned when nothing exists at that location.
    Otherwise a number, starting at 2, is appended directly to the path
    (``predict2``, ``predict3``, ...) and the first unused candidate is
    returned. The function only inspects the file system; it creates nothing.

    Args:
        base_path (str): Preferred directory path.

    Returns:
        str: A path at which nothing exists yet.
    """
    # Guard clause: the preferred name is still free.
    if not os.path.exists(base_path):
        return base_path

    # Probe numbered siblings until one is unused.
    suffix = 2
    candidate = f"{base_path}{suffix}"
    while os.path.exists(candidate):
        suffix += 1
        candidate = f"{base_path}{suffix}"
    return candidate

# Perform object detection

Load the trained model and run inference on the selected source.

In [2]:
from ultralytics import YOLO

# Load a model
# Exactly one `model = ...` line should be active; the others are kept as
# alternatives for different tasks.
#model = YOLO("yolo11s.pt", )  # load an official model
# model = YOLO("yolo11s-seg.pt")  # load an official model (instance segmentation)
model = YOLO(HOME_DIR+"/runs/obb/train/weights/best.pt")  # load my custom model (Oriented Bounding Boxes object detection), weights taken from the local runs/obb training folder
#model = YOLO(HOME_DIR+"/runs/segment/train/weights/best.pt")  # load my custom model

# Input to run the model on. The commented alternatives are a URL, a single
# image, a folder and a video file; the cells further down derive output file
# names from `source`.
# source = 'https://ultralytics.com/images/bus.jpg'
# source = HOME_DIR+'/example-files/IMG_3688.png'
# source = HOME_DIR+'/example-files/books'
# source = HOME_DIR+'/example-files/books.mov'
source = HOME_DIR+'/example-files/books/books_00005.png'

# Predict with the model
# conf=0.5 is passed to predict() as the confidence threshold.
results = model.predict(source, conf=0.5)  


image 1/1 /Users/andreas/Documents/Projekte/Objekterkennung.yolo11/example-files/books/books_00005.png: 480x640 225.3ms
Speed: 2.1ms preprocess, 225.3ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)


# Adjust images (OBB only)

From: https://github.com/ultralytics/ultralytics/issues/9344

Switching to an OBB (oriented bounding box) model means you'll be working with rotated bounding boxes. For an OBB model the detections are exposed through the result's `.obb` attribute (rather than `.boxes`), which contains the center coordinates, width, height, and angle in radians.

I took the following crop_rect function from here, as it also rotates the image: https://github.com/ultralytics/ultralytics/issues/13650

In [3]:
import cv2
from ultralytics import YOLO
import numpy as np
import os

def crop_rect(img, rect, interpolation=cv2.INTER_CUBIC):
    """
    Cut an oriented bounding box out of an image as an upright rectangle.

    The whole image is rotated about the centre of the box by the box angle,
    which makes the box axis-aligned; the box area is then sampled from the
    rotated image.

    Args:
        img (numpy.ndarray): Image to crop from.
        rect (tuple): Oriented box as ``(center, size, angle)``:
                     - rect[0]: centre of the box (x, y).
                     - rect[1]: size of the box (width, height).
                     - rect[2]: rotation angle of the box (in degrees).
        interpolation (int, optional): Interpolation flag used for the rotation.
                                       Defaults to cv2.INTER_CUBIC.

    Returns:
        numpy.ndarray: The axis-aligned crop taken from the rotated image.
    """
    # Centre and size are truncated to whole pixels; the angle is used as given.
    box_center = tuple(int(value) for value in rect[0])
    box_size = tuple(int(value) for value in rect[1])
    box_angle = rect[2]

    # The rotated image keeps the dimensions of the input image.
    n_rows, n_cols = img.shape[:2]

    # Rotate the full image about the box centre (scale factor 1).
    rotation = cv2.getRotationMatrix2D(box_center, box_angle, 1)
    rotated = cv2.warpAffine(img, rotation, (n_cols, n_rows), flags=interpolation)

    # The box is now axis-aligned, so a plain rectangular patch extracts it.
    return cv2.getRectSubPix(rotated, box_size, box_center)


import cv2
import numpy as np

def prepare_for_ocr(img):
    """
    Produce the two landscape orientations of a crop for OCR.

    A crop that is taller than wide is first turned 90 degrees clockwise so
    that it ends up wider than tall; a crop that is already at least as wide
    as tall is used unchanged. A copy rotated by a further 180 degrees is
    returned alongside it.

    Args:
        img (numpy.ndarray): The cropped rectangle image.

    Returns:
        tuple: (processed_img, rotated_180_img)
               - processed_img: the crop oriented to be wider than tall.
               - rotated_180_img: `processed_img` rotated by 180 degrees.
    """
    rows, cols = img.shape[:2]

    # Only portrait crops need the quarter turn.
    landscape = cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE) if rows > cols else img

    return landscape, cv2.rotate(landscape, cv2.ROTATE_180)

# Save results, rotate and crop images (OBB only)

The main code is again from https://github.com/ultralytics/ultralytics/issues/13650

In [4]:
# Create the output directory, if needed
# get_next_directory() returns a path that does not exist yet, so every run of
# this cell writes into a fresh folder (predict, predict2, predict3, ...).
OUTPUT_DIR = get_next_directory(os.path.join(HOME_DIR, "output/predict"))
os.makedirs(OUTPUT_DIR+"/book", exist_ok=True)

# Get only filename with no directories and no extension
# NOTE(review): the prefix is derived from `source`, not from each result, so
# when `results` holds more than one image all crops share this prefix and
# equal indices overwrite each other -- confirm whether that is intended.
filename = os.path.splitext(os.path.basename(source))[0]

# Process results
# One JSON document per non-empty result is written to results.json; with
# several results the file therefore holds several documents, one after another.
with open(OUTPUT_DIR+"/results.json", "w") as text_file:
    for result in results:
        
        # Skip results without any detection.
        if (len(result) > 0):
            result.show()

            print(result.to_json(), file=text_file)

            # One crop pair per detection. `idx` runs over ALL detections of the
            # result; no class filter is applied before writing into "book".
            for idx, obb in enumerate(result.obb.xyxyxyxy):
                # Corner points as an integer (N, 1, 2) array for OpenCV, then the
                # rotated rectangle (center, size, angle) enclosing them.
                points = obb.cpu().numpy().reshape((-1, 1, 2)).astype(int)
                rect = cv2.minAreaRect(points)

                # Rotate the image slightly so that it aligns with the axes.
                img_cropped = crop_rect(result.orig_img, rect)

                # Ensure the image is wider than tall and also return a variant rotated by 180 degrees.
                img, img_rotated_180 = prepare_for_ocr(img_cropped)

                # File names follow "<filename>_<idx>.jpg" and
                # "<filename>_rotated-180_<idx>.jpg"; the OCR cell relies on this pattern.
                cv2.imwrite(os.path.join(OUTPUT_DIR, "book", f"{filename}_{idx}.jpg"), img)
                cv2.imwrite(os.path.join(OUTPUT_DIR, "book", f"{filename}_rotated-180_{idx}.jpg"), img_rotated_180)

            # Text export of the detections, including confidences.
            result.save_txt(OUTPUT_DIR+"/results.txt", save_conf=True)

            print(result.summary())


[{'name': 'book', 'class': 0, 'confidence': 0.92829, 'box': {'x1': 2138.47437, 'y1': 2193.13525, 'x2': 2474.677, 'y2': 2211.39307, 'x3': 2559.83325, 'y3': 643.29034, 'x4': 2223.63062, 'y4': 625.03278}}, {'name': 'book', 'class': 0, 'confidence': 0.9155, 'box': {'x1': 2761.04761, 'y1': 2158.79468, 'x2': 2935.44067, 'y2': 2165.02563, 'x3': 2985.83765, 'y3': 754.52094, 'x4': 2811.44458, 'y4': 748.28986}}, {'name': 'book', 'class': 0, 'confidence': 0.91296, 'box': {'x1': 2932.27344, 'y1': 2170.69312, 'x2': 3104.52686, 'y2': 2176.56909, 'x3': 3153.72607, 'y3': 734.31604, 'x4': 2981.47266, 'y4': 728.43994}}, {'name': 'book', 'class': 0, 'confidence': 0.90544, 'box': {'x1': 2501.48193, 'y1': 2174.44922, 'x2': 2753.79492, 'y2': 2187.65234, 'x3': 2831.42773, 'y3': 704.10999, 'x4': 2579.11475, 'y4': 690.90662}}, {'name': 'book', 'class': 0, 'confidence': 0.8734, 'box': {'x1': 3395.00635, 'y1': 2191.7168, 'x2': 3565.3833, 'y2': 2203.33594, 'x3': 3671.73682, 'y3': 643.80695, 'x4': 3501.35986, 'y4'

# Perform OCR on each book image found

In [None]:
from PIL import Image

import cv2
import numpy as np

import pytesseract
from pytesseract import Output


def detect_text_regions(image, east_model, min_confidence=0.5, nms_threshold=0.8):
    """
    Detect text regions in an image using the EAST text detector.

    The image is resized to dimensions that are multiples of 32, passed through
    the EAST network, and the raw predictions are decoded into boxes. After
    non-maximum suppression the surviving boxes are painted onto a mask of the
    ORIGINAL image size, dilated so that neighbouring boxes merge, and the
    bounding rectangles of the merged blobs are returned.

    Args:
        image (numpy.ndarray): Input image. The blob is built with swapRB=True,
            so the channels are presumably expected in BGR order -- confirm
            against the caller.
        east_model: Network object offering ``setInput`` and ``forward``
            (for example the result of ``cv2.dnn.readNet``).
        min_confidence (float): Minimum score for a prediction to be kept.
        nms_threshold (float): Overlap threshold for non-maximum suppression.

    Returns:
        list[tuple[int, int, int, int]]: ``(startX, startY, endX, endY)`` boxes
        in the coordinate system of the input image. Empty when the image is
        empty or nothing was detected.
    """
    (origH, origW) = image.shape[:2]

    # An empty image cannot contain text (and could not be resized anyway).
    if origH == 0 or origW == 0:
        return []

    # EAST needs input dimensions that are multiples of 32. Clamp to at least
    # 32 so that narrow crops do not produce a zero-sized input and a division
    # by zero in the scale factors below.
    newW = max(32, (origW // 32) * 32)
    newH = max(32, (origH // 32) * 32)
    (rH, rW) = (origH / float(newH), origW / float(newW))  # resized -> original scale

    # Resize the image to fit the EAST model input
    resized_image = cv2.resize(image, (newW, newH))

    # The two EAST output layers of interest: the first yields the text
    # probabilities, the second the geometry used to derive the boxes.
    layerNames = [
        "feature_fusion/Conv_7/Sigmoid",
        "feature_fusion/concat_3"]

    # Prepare the input blob for the EAST model
    blob = cv2.dnn.blobFromImage(resized_image, 1.0, (newW, newH),
                                 (123.68, 116.78, 103.94), swapRB=True, crop=False)

    # Perform a forward pass to get scores and geometry
    east_model.setInput(blob)
    (scores, geometry) = east_model.forward(layerNames)

    # Decode predictions into (startX, startY, endX, endY) boxes
    (detections, confidences) = decodeBoundingBoxes(scores, geometry, min_confidence)
    if not detections:
        return []

    # cv2.dnn.NMSBoxes interprets each box as (x, y, width, height), so the
    # corner-format detections have to be converted before suppression.
    nms_boxes = [[sx, sy, ex - sx, ey - sy] for (sx, sy, ex, ey) in detections]
    nms_scores = [float(confidence) for confidence in confidences]
    indices = cv2.dnn.NMSBoxes(nms_boxes, nms_scores,
                               score_threshold=min_confidence, nms_threshold=nms_threshold)

    # Depending on the OpenCV version the result is an empty tuple, an (N,) or
    # an (N, 1) array; normalise it to a flat integer array.
    indices = np.asarray(indices, dtype=int).reshape(-1)

    # The boxes are drawn in original-image coordinates, so the mask must have
    # the original size; otherwise boxes at the right/bottom edge get clipped.
    kernel = np.ones((10, 10), np.uint8)
    mask = np.zeros((origH, origW), dtype=np.uint8)

    # Draw the kept boxes onto the mask
    for i in indices:
        (startX, startY, endX, endY) = detections[i]
        startX = int(startX * rW)
        startY = int(startY * rH)
        endX = int(endX * rW)
        endY = int(endY * rH)
        cv2.rectangle(mask, (startX, startY), (endX, endY), 255, -1)

    # Dilate the mask to merge nearby detections
    mask = cv2.dilate(mask, kernel, iterations=1)

    # Find contours from the merged mask
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # One bounding rectangle per merged blob
    boxes = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        boxes.append((x, y, x + w, y + h))

    return boxes


def decodeBoundingBoxes(scores, geometry, scoreThresh):
    """
    Turn the raw EAST output maps into candidate boxes and their scores.

    Every cell of the score map whose score is not below `scoreThresh`
    yields one box. Cells are visited row by row, so the output order follows
    the row-major order of the score map.

    Args:
        scores (numpy.ndarray): Score map of shape (1, 1, rows, cols).
        geometry (numpy.ndarray): Geometry map of shape (1, 5, rows, cols);
            channels 0-3 are distances and channel 4 is an angle used with
            cos/sin (presumably radians and the usual EAST layout -- confirm).
        scoreThresh (float): Cells scoring below this value are skipped.

    Returns:
        tuple: ``(detections, confidences)`` where `detections` is a list of
        ``(startX, startY, endX, endY)`` integer tuples and `confidences`
        holds the matching scores.

    Raises:
        AssertionError: If the shapes of `scores` / `geometry` are not as
        described above.
    """
    # Shape checks, in the same order as the messages are expected to appear.
    assert len(scores.shape) == 4, "Incorrect dimensions of scores"
    assert len(geometry.shape) == 4, "Incorrect dimensions of geometry"
    assert scores.shape[0] == 1, "Invalid dimensions of scores"
    assert geometry.shape[0] == 1, "Invalid dimensions of geometry"
    assert scores.shape[1] == 1, "Invalid dimensions of scores"
    assert geometry.shape[1] == 5, "Invalid dimensions of geometry"
    assert scores.shape[2] == geometry.shape[2], "Invalid dimensions of scores and geometry"
    assert scores.shape[3] == geometry.shape[3], "Invalid dimensions of scores and geometry"

    score_map = scores[0, 0]
    dist0_map = geometry[0, 0]
    dist1_map = geometry[0, 1]
    dist2_map = geometry[0, 2]
    dist3_map = geometry[0, 3]
    angle_map = geometry[0, 4]

    boxes = []
    box_scores = []

    # Cells that are NOT below the threshold, as plain Python (row, col) pairs
    # in row-major order.
    kept_cells = np.argwhere(~(score_map < scoreThresh)).tolist()

    for row, col in kept_cells:
        # Each cell of the output maps corresponds to a 4x4 pixel patch.
        x_offset, y_offset = col * 4.0, row * 4.0

        cell_angle = angle_map[row, col]
        cos_a = np.cos(cell_angle)
        sin_a = np.sin(cell_angle)

        d0 = dist0_map[row, col]
        d1 = dist1_map[row, col]
        d2 = dist2_map[row, col]
        d3 = dist3_map[row, col]

        box_h = d0 + d2
        box_w = d1 + d3

        # End corner first, then step back by the box size for the start corner.
        end_x = int(x_offset + (cos_a * d1) + (sin_a * d2))
        end_y = int(y_offset - (sin_a * d1) + (cos_a * d2))
        start_x = int(end_x - box_w)
        start_y = int(end_y - box_h)

        boxes.append((start_x, start_y, end_x, end_y))
        box_scores.append(score_map[row, col])

    return (boxes, box_scores)


def showBoundingBoxes(image, boxes):
    """
    Draw numbered boxes onto `image` and show it in an OpenCV window.

    The rectangles and labels are drawn directly into `image`, so pass a copy
    if the original must stay untouched. The window stays open until a key is
    pressed. When `boxes` is None or empty, a notice is printed and nothing is
    shown.

    Args:
        image (numpy.ndarray): Image to draw on.
        boxes: Sequence of ``(startX, startY, endX, endY)`` boxes.
    """
    has_boxes = boxes is not None and len(boxes) != 0
    if not has_boxes:
        print("No bounding boxes to display.")
        return

    green = (0, 255, 0)
    for index, box in enumerate(boxes):
        left, top, right, bottom = box

        # Outline of the box
        cv2.rectangle(image, (left, top), (right, bottom), color=green, thickness=2)

        # Index label just inside the top-left corner
        label_origin = (left + 10, top + 40)
        cv2.putText(image, f"{index}", label_origin,
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1.2,
                    color=green, thickness=2)

    # Blocks until any key is pressed, then closes the window again.
    cv2.imshow("Bounding boxes", image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()


def fourPointsTransform(frame, vertices):
    """
    Warp the quadrilateral `vertices` of `frame` into a 100x32 patch.

    The four source points are mapped, in order, onto the bottom-left,
    top-left, top-right and bottom-right corners of the output patch.

    Args:
        frame (numpy.ndarray): Source image.
        vertices: Four (x, y) source points in the order given above.

    Returns:
        numpy.ndarray: The warped patch, 100 pixels wide and 32 pixels high.
    """
    out_w, out_h = 100, 32

    src_quad = np.asarray(vertices)
    dst_quad = np.array(
        [
            [0, out_h - 1],          # bottom-left
            [0, 0],                  # top-left
            [out_w - 1, 0],          # top-right
            [out_w - 1, out_h - 1],  # bottom-right
        ],
        dtype="float32",
    )

    warp = cv2.getPerspectiveTransform(src_quad, dst_quad)
    return cv2.warpPerspective(frame, warp, (out_w, out_h))



def rotate_image(image, angle):
    """
    Rotate an image about its centre, enlarging the canvas so nothing is cut off.

    Args:
        image (numpy.ndarray): The input image.
        angle (float): Rotation angle handed to cv2.getRotationMatrix2D.

    Returns:
        numpy.ndarray: The rotated image on a canvas sized to hold it entirely.
    """
    height, width = image.shape[:2]
    center_x, center_y = width // 2, height // 2

    # Plain rotation about the image centre, no scaling.
    matrix = cv2.getRotationMatrix2D((center_x, center_y), angle, 1.0)
    abs_cos = np.abs(matrix[0, 0])
    abs_sin = np.abs(matrix[0, 1])

    # Size of the axis-aligned box that encloses the rotated image.
    out_w = int((height * abs_sin) + (width * abs_cos))
    out_h = int((height * abs_cos) + (width * abs_sin))

    # Shift the result so that it is centred on the enlarged canvas.
    matrix[0, 2] += (out_w / 2) - center_x
    matrix[1, 2] += (out_h / 2) - center_y

    return cv2.warpAffine(image, matrix, (out_w, out_h))


def ocr_onImage(image_path, east_model=None):
    """
    Detect text regions in an image with EAST and visualise them.

    NOTE: the per-region Tesseract step below is still commented out, so the
    returned dictionary is currently always empty.

    Args:
        image_path (str): Path to the image for OCR.
        east_model (optional): An already loaded EAST network. When omitted,
            the network is loaded from
            ``HOME_DIR/notebooks/east_text_detection.pb`` on every call; pass a
            loaded network to avoid reloading it for each image.

    Returns:
        dict: Maps ``"text_region_<i>"`` to the recognised text of that region
        (empty for now, see the note above).
    """
    # Read the file with Pillow and close it again right away. Converting to
    # "RGB" guarantees a 3-channel array, so grayscale, palette and RGBA files
    # work as well; OpenCV then gets the channels in its BGR order.
    with Image.open(image_path) as pil_image:
        rgb_pixels = np.array(pil_image.convert("RGB"))
    image = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)

    # --- Detect text regions using EAST, correct orientation, and perform OCR. ---

    # Load the pre-trained EAST model unless the caller supplied one
    if east_model is None:
        east_model_path = os.path.join(HOME_DIR, "notebooks", "east_text_detection.pb")
        print("[INFO] loading EAST text detector...")
        east_model = cv2.dnn.readNet(east_model_path)

    # Detect text regions using EAST
    boxes = detect_text_regions(image, east_model)

    # Visualize all bounding boxes found (on a copy; the drawing is in place).
    showBoundingBoxes(image.copy(), boxes)

    # Recognised text per region; stays empty until the OCR step is enabled.
    ocr_results = {}

    # # Get cropped image using perspective transform
    # cropped_images = []
    # cropped_image = fourPointsTransform(image, boxes)
    # cropped_image = cv2.cvtColor(cropped_image, cv2.COLOR_BGR2GRAY)

    # cropped_images.append(cropped_image)

    # # Process each detected region
    # for i, imageWithText in enumerate(cropped_images):

    #     cv2.imshow(f"Region {i}", imageWithText)
    #     cv2.waitKey(0)

    #     # Perform OCR on the corrected region
    #     ocr_text = pytesseract.image_to_string(imageWithText, config="--psm 6")
    #     ocr_results[f"text_region_{i}"] = ocr_text.strip()

    return ocr_results
    

# get grayscale image
def grayscale(image):
    """Convert a BGR image to a single-channel grayscale image."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return gray

# noise removal
def denoise(image):
    """Smooth the image with a median filter of aperture size 5."""
    aperture = 5
    return cv2.medianBlur(image, aperture)

# thresholding
def thresholding(image):
    """Binarise the image to 0/255 with Otsu's automatically chosen threshold."""
    _, binary = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary


In [None]:
import cv2
import pandas as pd

#import easyocr
import pytesseract

# --- Needs tesseract on the path. I've installed it via homebrew.

# Get only filename with no directories and no extension
filename = os.path.splitext(os.path.basename(source))[0]

for result in results:

    # Nothing was detected in this image.
    if len(result) == 0:
        continue

    # The OBB crops were written as "<filename>_<idx>.jpg", where idx is the
    # position of the detection among ALL detections of the result. Use the
    # same position here; counting only the "book" detections would point at
    # the wrong crop as soon as any other class is detected.
    #
    # (For plain object detection / instance segmentation the crops were named
    #  'image.jpg', 'image2.jpg', ... instead:
    #    image_filename = f"image{i}.jpg" if i>1 else 'image.jpg'
    #    image_path = OUTPUT_DIR + '/book/' + image_filename )
    for i, detection in enumerate(result.summary()):
        if detection['name'] != 'book':
            print("Skipping", detection['name'], '...')
            continue

        print(f"Book {i} found")

        # Perform OCR on all (both) image variants.
        image_variants = [
            f"{filename}_{i}.jpg",  # Original image
            f"{filename}_rotated-180_{i}.jpg"  # 180-degree rotated image
        ]

        # Iterate over each variant, process the OCR, and print the result
        for variant_filename in image_variants:
            img_path = os.path.join(OUTPUT_DIR, "book", variant_filename)
            detected_texts = ocr_onImage(img_path)

            # Display OCR results
            print(f"{img_path} ->")
            for region, text in detected_texts.items():
                print(f"    {region}: {text}")

Book 0 found
[INFO] loading EAST text detector...


2025-02-16 22:12:17.498 python[96051:763322] +[IMKClient subclass]: chose IMKClient_Modern
2025-02-16 22:12:17.498 python[96051:763322] +[IMKInputSession subclass]: chose IMKInputSession_Modern


/Users/andreas/Documents/Projekte/Objekterkennung.yolo11/output/predict41/book/books_00005_0.jpg ->
[INFO] loading EAST text detector...
/Users/andreas/Documents/Projekte/Objekterkennung.yolo11/output/predict41/book/books_00005_rotated-180_0.jpg ->
Book 1 found
[INFO] loading EAST text detector...
/Users/andreas/Documents/Projekte/Objekterkennung.yolo11/output/predict41/book/books_00005_1.jpg ->
[INFO] loading EAST text detector...
/Users/andreas/Documents/Projekte/Objekterkennung.yolo11/output/predict41/book/books_00005_rotated-180_1.jpg ->
Book 2 found
[INFO] loading EAST text detector...
/Users/andreas/Documents/Projekte/Objekterkennung.yolo11/output/predict41/book/books_00005_2.jpg ->
[INFO] loading EAST text detector...
/Users/andreas/Documents/Projekte/Objekterkennung.yolo11/output/predict41/book/books_00005_rotated-180_2.jpg ->
Book 3 found
[INFO] loading EAST text detector...
/Users/andreas/Documents/Projekte/Objekterkennung.yolo11/output/predict41/book/books_00005_3.jpg ->
[IN

: 