In [1]:
import os

# Current working directory at the time this cell runs (presumably the folder
# that contains this notebook -- confirm if the kernel is started elsewhere).
NOTEBOOKS_DIR = os.getcwd()
# Parent of NOTEBOOKS_DIR; later cells build the model, example-file and
# output paths relative to it.
HOME_DIR = os.path.dirname(NOTEBOOKS_DIR)

def get_next_directory(base_path="output/predict"):
    """
    Return the first path in the series base_path, base_path2, base_path3, ...
    that does not exist yet.

    Nothing is created on disk; the caller is expected to create the directory.

    Args:
        base_path (str): Path that is tried first, without a numeric suffix.

    Returns:
        str: `base_path` itself when it does not exist, otherwise `base_path`
             followed by the smallest integer >= 2 for which the path is free.
    """
    # The un-suffixed path wins whenever it is still free.
    if not os.path.exists(base_path):
        return base_path

    # Numbering starts at 2 because the un-suffixed path counts as number 1.
    suffix = 2
    candidate = f"{base_path}{suffix}"
    while os.path.exists(candidate):
        suffix += 1
        candidate = f"{base_path}{suffix}"
    return candidate

# Perform object detection

Perform the inference with the loaded model.

In [2]:
from ultralytics import YOLO

# Load a model (exactly one `model = ...` line should be active)
#model = YOLO("yolo11s.pt", )  # load an official model
# model = YOLO("yolo11s-seg.pt")  # load an official model (instance segmentation)
model = YOLO(HOME_DIR+"/runs/obb/train/weights/best.pt")  # load my custom model (Oriented Bounding Boxes Object Detection)
#model = YOLO(HOME_DIR+"/runs/segment/train/weights/best.pt")  # load my custom model

# Input to run inference on (exactly one `source = ...` line should be active).
# Later cells also derive the output file names from this path.
# source = 'https://ultralytics.com/images/bus.jpg'
# source = HOME_DIR+'/example-files/IMG_3688.png'
# source = HOME_DIR+'/example-files/books'
# source = HOME_DIR+'/example-files/books.mov'
source = HOME_DIR+'/example-files/books/books_00005.png'

# Predict with the model; conf=0.5 is passed as the confidence threshold.
# `results` is iterated by the save and OCR cells below.
results = model.predict(source, conf=0.5)  


image 1/1 /Users/andreas/Documents/Projekte/Objekterkennung.yolo11/example-files/books/books_00005.png: 480x640 87.4ms
Speed: 1.9ms preprocess, 87.4ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)


# Adjust images (OBB only)

From: https://github.com/ultralytics/ultralytics/issues/9344

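Switching to an OBB (oriented bounding box) model means you'll be working with rotated bounding boxes. For an OBB model the detections are exposed through the result's `.obb` attribute (as used in the code below) rather than `.boxes`; its `xywhr` field contains the center coordinates, width, height, and angle in radians.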

I took the following crop_rect function from here, as it also rotates the image: https://github.com/ultralytics/ultralytics/issues/13650

In [None]:
import cv2
from ultralytics import YOLO
import numpy as np
import os

def crop_rect(img, rect, interpolation=cv2.INTER_CUBIC):
    """
    Cut an oriented bounding box out of an image as an upright rectangle.

    The whole image is rotated around the box center by the box angle, which
    makes the box axis-aligned; a patch of the box size is then taken around
    that same center.

    Args:
        img (numpy.ndarray): Image to crop from.
        rect (tuple): Oriented box as ((center_x, center_y), (width, height), angle),
                      with the angle in degrees. Center and size are truncated
                      to whole pixels before use.
        interpolation (int, optional): Interpolation flag handed to the rotation.
                                       Defaults to cv2.INTER_CUBIC.

    Returns:
        numpy.ndarray: The cropped, axis-aligned patch.
    """
    box_center, box_size, box_angle = rect[0], rect[1], rect[2]

    # Work with integer pixel coordinates for both the pivot and the crop size.
    center_px = tuple(int(value) for value in box_center)
    size_px = tuple(int(value) for value in box_size)

    # The rotated image keeps the dimensions of the input image.
    rows, cols = img.shape[:2]

    # Rotate around the box center (scale factor 1) so the box becomes axis-aligned.
    rotation = cv2.getRotationMatrix2D(center_px, box_angle, 1)
    upright = cv2.warpAffine(img, rotation, (cols, rows), flags=interpolation)

    # The box center is the pivot and therefore stays where it was; crop around it.
    return cv2.getRectSubPix(upright, size_px, center_px)


import cv2
import numpy as np

def prepare_for_ocr(img):
    """
    Produce the two orientations of a crop that are handed to OCR.

    A crop that is taller than wide is first turned 90 degrees clockwise; any
    other crop (including a square one) is used as is. A second copy rotated
    by 180 degrees is generated from that image.

    Args:
        img (numpy.ndarray): The cropped rectangle image.

    Returns:
        tuple: (processed_img, rotated_180_img)
               - processed_img: `img`, rotated clockwise if it was taller than wide.
               - rotated_180_img: `processed_img` rotated by 180 degrees.
    """
    rows, cols = img.shape[:2]
    is_portrait = rows > cols

    # Only portrait crops are turned onto their side.
    landscape = cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE) if is_portrait else img

    # Upside-down variant of the same image.
    upside_down = cv2.rotate(landscape, cv2.ROTATE_180)

    return landscape, upside_down

# Save results, rotate and crop images (OBB only)

The main code is again from https://github.com/ultralytics/ultralytics/issues/13650

In [None]:
# Create the output directory, if needed.
# get_next_directory returns a path that does not exist yet; the "book"
# subfolder receives the cropped images written below.
OUTPUT_DIR = get_next_directory(os.path.join(HOME_DIR, "output/predict"))
os.makedirs(OUTPUT_DIR+"/book", exist_ok=True)

# Get only filename with no directories and no extension
# (used as the prefix of every crop file name).
filename = os.path.splitext(os.path.basename(source))[0]

# Process results
with open(OUTPUT_DIR+"/results.json", "w") as text_file:
    for result in results:
        
        # Skip results that are empty (presumably: no detections -- confirm).
        if (len(result) > 0):
            result.show()

            # One JSON document is written per result, each on its own line.
            print(result.to_json(), file=text_file)

            # idx is the position of the box among ALL oriented boxes of this
            # result; it becomes part of the crop file names.
            for idx, obb in enumerate(result.obb.xyxyxyxy):
                # Corner points as an integer array of shape (-1, 1, 2), then
                # the enclosing rotated rectangle (center, size, angle).
                points = obb.cpu().numpy().reshape((-1, 1, 2)).astype(int)
                rect = cv2.minAreaRect(points)

                # Rotate the image slightly so that it aligns with the axes.
                img_cropped = crop_rect(result.orig_img, rect)

                # Ensure the image is wider than tall and also return a variant rotated by 180 degrees.
                img, img_rotated_180 = prepare_for_ocr(img_cropped)

                # Both variants go into the "book" folder, whatever class was detected.
                cv2.imwrite(os.path.join(OUTPUT_DIR, "book", f"{filename}_{idx}.jpg"), img)
                cv2.imwrite(os.path.join(OUTPUT_DIR, "book", f"{filename}_rotated-180_{idx}.jpg"), img_rotated_180)

            # Plain-text export of the detections, including confidences.
            result.save_txt(OUTPUT_DIR+"/results.txt", save_conf=True)

            print(result.summary())


[{'name': 'book', 'class': 0, 'confidence': 0.92829, 'box': {'x1': 2138.47437, 'y1': 2193.1355, 'x2': 2474.677, 'y2': 2211.39282, 'x3': 2559.83325, 'y3': 643.29034, 'x4': 2223.63062, 'y4': 625.03278}}, {'name': 'book', 'class': 0, 'confidence': 0.9155, 'box': {'x1': 2761.04785, 'y1': 2158.79468, 'x2': 2935.44092, 'y2': 2165.02563, 'x3': 2985.83789, 'y3': 754.52069, 'x4': 2811.44482, 'y4': 748.28961}}, {'name': 'book', 'class': 0, 'confidence': 0.91296, 'box': {'x1': 2932.27344, 'y1': 2170.69287, 'x2': 3104.52686, 'y2': 2176.56885, 'x3': 3153.72607, 'y3': 734.31586, 'x4': 2981.47266, 'y4': 728.43988}}, {'name': 'book', 'class': 0, 'confidence': 0.90544, 'box': {'x1': 2501.48193, 'y1': 2174.44922, 'x2': 2753.79492, 'y2': 2187.65234, 'x3': 2831.42773, 'y3': 704.11011, 'x4': 2579.11475, 'y4': 690.90674}}, {'name': 'book', 'class': 0, 'confidence': 0.8734, 'box': {'x1': 3395.00659, 'y1': 2191.71704, 'x2': 3565.38354, 'y2': 2203.33618, 'x3': 3671.73657, 'y3': 643.80688, 'x4': 3501.35962, 'y4

# Save results and cropped images

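Alternative to the OBB cell above, for regular (axis-aligned) detection or segmentation models. Saves a JSON file with all results (including class and confidence) and one cropped image for each bounding box.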

In [None]:
# Create the output directory, if needed.
# NOTE(review): this rebinds OUTPUT_DIR to a fresh directory. If the OBB cell
# above was run as well, the OCR cell below will look in this new directory
# and not in the one that holds the OBB crops.
OUTPUT_DIR = get_next_directory(os.path.join(HOME_DIR, "output/predict"))
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Process results
with open(OUTPUT_DIR+"/results.json", "w") as text_file:
    for result in results:
        
        # Skip results that are empty (presumably: no detections -- confirm).
        if (len(result) > 0):
            result.show()

            # One JSON document is written per result, each on its own line.
            print(result.to_json(), file=text_file)
            # Presumably writes one crop per box below OUTPUT_DIR, using
            # "image" as the base file name -- confirm against the ultralytics docs.
            result.save_crop(OUTPUT_DIR, "image")
            # Plain-text export of the detections, including confidences.
            result.save_txt(OUTPUT_DIR+"/results.txt", save_conf=True)

            print(result.summary())

# Perform OCR on each book image found

In [None]:
import cv2
import pytesseract

def ocr_onImage(img_path):
    """
    Load an image from disk and return the text OCR finds in it.

    Args:
        img_path (str): Path of the image file to read.

    Returns:
        Whatever `ocr_core` returns for the loaded image (the recognized text).

    Raises:
        OSError: If OpenCV cannot load an image from `img_path`
                 (missing file, unreadable file or unsupported format).
    """
    image = cv2.imread(img_path)

    # cv2.imread reports failure by returning None instead of raising; fail
    # here with the offending path rather than inside the OCR call.
    if image is None:
        raise OSError(f"Could not read image: {img_path}")

    # --- Perform OCR using EasyOCR (disabled alternative)
    #reader = easyocr.Reader(['de'])  # Choose language
    #ocr_text = reader.readtext(cropped_image, rotation_info=[90,180,270], paragraph=True, width_ths=0.7, detail=0)

    # --- Perform OCR using Tesseract

    # Do some image processing first (currently disabled).
    #img = grayscale(rotated_image)
    #img = thresholding(img)
    #img = denoise(img)

    return ocr_core(image)

def ocr_core(img):
    """Run Tesseract on `img` and return the result of `image_to_string`."""
    return pytesseract.image_to_string(img)

# grayscale conversion
def grayscale(image):
    """Convert `image` from BGR colour to grayscale and return the result."""
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return gray_image

# noise reduction
def denoise(image):
    """Apply a median blur with aperture size 5 to `image` and return the result."""
    aperture = 5
    return cv2.medianBlur(image, aperture)

# binarization
def thresholding(image):
    """
    Binarize `image` with cv2.threshold using the THRESH_BINARY + THRESH_OTSU
    flags (threshold argument 0, maximum value 255) and return the thresholded
    image, i.e. the second element of what cv2.threshold returns.
    """
    otsu_binary = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _, binarized = cv2.threshold(image, 0, 255, otsu_binary)
    return binarized


In [None]:
import cv2

#import easyocr
import pytesseract

# --- Needs tesseract on the path. I've installed it via homebrew.

# Get only filename with no directories and no extension.
# The OBB crop cell builds its file names from the same stem.
filename = os.path.splitext(os.path.basename(source))[0]

for result in results:

    if (len(result) > 0):
        # for OBB
        # The OBB crop cell names its files `{filename}_{idx}.jpg`, where idx is
        # the position of the box among ALL boxes of the result, whatever their
        # class. The same position has to be used here: a counter that only
        # advances for books would point at the wrong crop as soon as a
        # detection of another class precedes a book.
        # (Assumes result.summary() lists detections in the same order as
        # result.obb -- confirm against the ultralytics Results documentation.)
        for i, detection in enumerate(result.summary()):
            if (detection['name'] == 'book'):
                print(f"Book {i} found")

                # for object detection and instance separation
                # (there the crop files are numbered per class, so a separate
                # per-class counter starting at 1 is needed instead of `i`):
                # image_filename = f"image{i}.jpg" if i>1 else 'image.jpg'
                # image_path = OUTPUT_DIR + '/book/' + image_filename

                # for OBB
                # Perform OCR on all (both) image variants.
                image_variants = [
                    f"{filename}_{i}.jpg",  # Original image
                    f"{filename}_rotated-180_{i}.jpg"  # 180-degree rotated image
                ]

                # Iterate over each variant, process the OCR, and print the result
                for variant_filename in image_variants:
                    img_path = os.path.join(OUTPUT_DIR, "book", variant_filename)
                    ocr_text = ocr_onImage(img_path)
                    print(f"{img_path} -> {ocr_text}")
            else:
                print("Skipping", detection['name'], '...')

Book 0 found
Image path: /Users/andreas/Documents/Projekte/Objekterkennung.yolo11/output/predict/book/books_00005_crop_0.jpg
->  FUR VERNUNFT,
WISSENSCHAFT,

HUMANISMUS
UND FORTSCHRITE


Book 1 found
Image path: /Users/andreas/Documents/Projekte/Objekterkennung.yolo11/output/predict/book/books_00005_crop_1.jpg
->  HANSI


Book 2 found
Image path: /Users/andreas/Documents/Projekte/Objekterkennung.yolo11/output/predict/book/books_00005_crop_2.jpg
->  RONJA VON RONNE | ENDE IN SICHT

Book 3 found
Image path: /Users/andreas/Documents/Projekte/Objekterkennung.yolo11/output/predict/book/books_00005_crop_3.jpg
->  Jaron Lanier Wem gehdrt die Zukunft?
eo a - - ea


Book 4 found
Image path: /Users/andreas/Documents/Projekte/Objekterkennung.yolo11/output/predict/book/books_00005_crop_4.jpg
->  |e ANGELIKA A WALDIS « ICH KOMME Mit

FOOT et A

Book 5 found
Image path: /Users/andreas/Documents/Projekte/Objekterkennung.yolo11/output/predict/book/books_00005_crop_5.jpg
->  | ® SATOSHI YAGISAWA DIE TA