In [None]:
import cv2


In [11]:
# 1. Constants (Where are the image files?)
# TRAIN_CHARACTERS = "../data/train_cleaned_characters"
# TEST_CHARACTERS = "../data/test_cleaned_characters"

TRAIN_CHARACTERS_RESIZED = "../data/train_cleaned_characters_resized"
TEST_CHARACTERS_RESIZED = "../data/test_cleaned_characters_resized"

In [12]:
# 2. Run KNN on training set (character recognition)
def convert_image_to_array(image):
    return image.flatten()

from sklearn.neighbors import KNeighborsClassifier
import numpy as np

def load_images_from_folder(folder):
    import os
    import cv2

    images = []
    labels = []

    for idx, subfolder in enumerate(os.listdir(folder)):
        for filename in os.listdir(os.path.join(folder, subfolder)):
            img = cv2.imread(os.path.join(folder, subfolder, filename), cv2.IMREAD_GRAYSCALE)
            if img is not None:
                images.append(convert_image_to_array(img))
                labels.append(subfolder)

    return np.array(images), np.array(labels)

X_train, y_train = load_images_from_folder(TRAIN_CHARACTERS_RESIZED)
X_test, y_test = load_images_from_folder(TEST_CHARACTERS_RESIZED)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)






In [None]:
# 4. Test KNN with test set. Find accuracy.
from sklearn.metrics import f1_score
y_pred = knn.predict(X_test)


Accuracy:  0.6806135326667314
F1 Score:  0.6901667679477549


In [15]:
accuracy = np.mean(y_pred == y_test)
print("Accuracy: ", accuracy)
print("F1 Score: ", f1_score(y_test, y_pred, average='weighted'))

Accuracy:  0.6806135326667314
F1 Score:  0.6901667679477549


In [30]:
# 5. Actually read captchas!

# Helper functions
from collections import Counter
import os
import random
import cv2
import multiprocessing
import concurrent.futures
import string
import sys
import numpy as np

sys.setrecursionlimit(10**6)


def color_letters_black(img):
    img = img.copy()

    for i, row in enumerate(img):
        for j, pixel in enumerate(row):
            if not (pixel == [255, 255, 255]).all():
                img[i][j] = [0, 0, 0]
            else:
                img[i][j] = [255, 255, 255]
    return img


def merge_boxes(box1, box2):
    return [min(box1[0], box2[0]), max(box1[1], box2[1]), min(box1[2], box2[2]), max(box1[3], box2[3]), box1[4]]

def is_surrounded_horizontally(box1, box2, atol=10)-> bool:
    """return box1 horizontally is inside box2"""
    return box1[2] >= box2[2] - atol and box1[3] <= box2[3] + atol

def is_tiny_box(box):
    return (abs(box[1] - box[0]) * abs(box[3] - box[2])) < 80

def generate_random_color():
    return [random.randint(0, 255) for _ in range(3)]

def partition_box(box, n=2):
    # print(type(box[3]))
    # print(box[3], box[2], n, (box[3] - box[2]) // n)
    part_length = int((box[3] - box[2]) // n)
    # print(part_length)
    return [[box[0], box[1], box[2] + i * part_length, box[2] + (i + 1) * part_length] for i in range(n)]

def pixels_are_close(pixel1, pixel2, atol=20):
    """
    Determines if two RGB pixels are close in color, allowing for slight variations.
    
    Uses Euclidean distance in the RGB color space.
    
    Args:
        pixel1 (array-like): First pixel (e.g., [R, G, B]).
        pixel2 (array-like): Second pixel (e.g., [R, G, B]).
        atol (int): Acceptable color difference threshold.
        
    Returns:
        bool: True if the pixels are similar, False otherwise.
    """
    return np.linalg.norm(np.array(pixel1) - np.array(pixel2)) < atol

def merge_all_boxes(bounding_boxes, close_threshold=30, debug=False):
    merged_boxes = []
    for box in bounding_boxes:
        if not merged_boxes:
            merged_boxes.append(box)
        else:
            if is_surrounded_horizontally(box, merged_boxes[-1]) or is_surrounded_horizontally(merged_boxes[-1], box):
                # print("surrounded", abs(box[1] - box[0]) * abs(box[3] - box[2]))
                merged_boxes[-1] = merge_boxes(merged_boxes[-1], box)
                # print("tiny", abs(box[1] - box[0]) * abs(box[3] - box[2]))
                # merged_boxes[-1] = merge_boxes(merged_boxes[-1], box)
                # Note: we don't append tiny boxes for now, seem to be causing the effect
                # pass
            elif pixels_are_close(box[4], merged_boxes[-1][4]) and box[2] - merged_boxes[-1][3] < close_threshold:
                if debug:
                    print("close", box[4], merged_boxes[-1][4])
                merged_boxes[-1] = merge_boxes(merged_boxes[-1], box)
            elif (is_tiny_box(box) or is_tiny_box(merged_boxes[-1])) and box[2] - merged_boxes[-1][3] < close_threshold:
                pass
            else:
                if debug:
                    print("not close", box[4], merged_boxes[-1][4], np.sum(np.abs(box[4] - merged_boxes[-1][4])))
                # if pixels_are_close(box[4], merged_boxes[-1][4]):
                    # print(box, merged_boxes[-1], box[2] - merged_boxes[-1][3], "whats going on")
               
                # print("not tiny", abs(box[1] - box[0]) * abs(box[3] - box[2]))
                merged_boxes.append(box)
    return merged_boxes

def get_bounding_boxes(img):
    # Assume this is the cleaned black letter image

    img = img.copy()
    img = cv2.GaussianBlur(img, (5, 5), 0)
    img = img.astype("int32")

    shape = img.shape[:2]
    visited = np.zeros(shape, dtype=bool)
    bounding_boxes = [] # top bottom left right color

    def dfs(i, j, color):
        if i < 0 or j < 0 or i >= shape[0] or j >= shape[1] or visited[i][j]:
            return
        if pixels_are_close(img[i][j], [255, 255, 255]):
            return
        # if color is not close enough return
        if not pixels_are_close(img[i][j], color):
            return  
        visited[i][j] = True
        bounding_boxes[-1][0] = min(bounding_boxes[-1][0], i)
        bounding_boxes[-1][1] = max(bounding_boxes[-1][1], i)
        bounding_boxes[-1][2] = min(bounding_boxes[-1][2], j)
        bounding_boxes[-1][3] = max(bounding_boxes[-1][3], j)

        # color = img[i][j]
        dfs(i + 1, j + 1, color)
        dfs(i + 1, j, color)
        dfs(i + 1, j - 1, color)

        dfs(i - 1, j - 1, color)
        dfs(i - 1, j, color)
        dfs(i - 1, j + 1, color)
       
        dfs(i, j + 1, color)
        dfs(i, j - 1, color)

    for i in range(shape[0]):
        for j in range(shape[1]):
            if visited[i][j] or pixels_are_close(img[i][j], [255, 255, 255]):
                continue
            bounding_boxes.append([i, i, j, j, img[i][j]])
            dfs(i, j, img[i][j])

    bounding_boxes = [box for box in bounding_boxes if box[0] != box[1] and box[2] != box[3]]
    
    # Merge boxes
    bounding_boxes.sort(key=lambda x: x[2])
    close_threshold = 8 #0.025 * shape[1]
    # print("close threshold", int(close_threshold))
    merged_boxes = merge_all_boxes(merge_all_boxes(bounding_boxes, close_threshold=close_threshold), close_threshold=close_threshold)
    

    # Split boxes
    horizontal_lengths = [abs(box[3] - box[2]) for box in merged_boxes]
    
    # print(len(merged_boxes))
    for box in merged_boxes:
        # print(box)
        cv2.rectangle(img, (box[2], box[0]), (box[3], box[1]), [255, 0, 0], 1)
        # print(box)

    return img, merged_boxes

def resize_with_padding(image, target_width, target_height):
    """
    Resizes an image while maintaining aspect ratio, 
    scaling it to fit within target dimensions, and padding the rest with white.

    Parameters:
        image (numpy.ndarray): Input image.
        target_width (int): Target width.
        target_height (int): Target height.

    Returns:
        numpy.ndarray: Resized and padded image.
    """
    h, w = image.shape[:2]
    
    # Compute scaling factor to fit within the target size
    scale = min(target_width / w, target_height / h)
    new_w = int(w * scale)
    new_h = int(h * scale)

    # Resize image
    resized = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_AREA)

    # Create a white canvas
    result = np.ones((target_height, target_width), dtype=np.uint8) * 255

    # Compute top-left corner for centering
    x_offset = (target_width - new_w) // 2
    y_offset = (target_height - new_h) // 2

    # Place resized image onto the white canvas
    result[y_offset:y_offset+new_h, x_offset:x_offset+new_w] = resized

    return result

def get_captcha_from_image(img):
    # get bounding boxes
    _, boxes = get_bounding_boxes(img)

    # get letters - 169 x 78
    letters = [img[box[0]:box[1], box[2]:box[3]] for box in boxes]
    letters = [resize_with_padding(letter, 169, 78) for letter in letters]


    # get predictions
    predictions = knn.predict([convert_image_to_array(letter) for letter in letters])
    return len(boxes), "".join(predictions)




In [51]:
def read_captcha_from_image(img_path):
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    # img = color_letters_black(img)
    return get_captcha_from_image(img)

TEST_FOLDER = "../data/test_cleaned_black_resized"
def get_random_image_path():
    import os
    import random
    img_path = os.path.join(TEST_FOLDER, random.choice(os.listdir(TEST_FOLDER)))
    return img_path

path = get_random_image_path()
print(path)
print(read_captcha_from_image(path))

# u22w <-- completely cmi 1 box found

# segmentation = 0.9
# a single letter 0.7
# 4 - 8 characters long
# 0.9 * 0.7**6 =

# 0.01
# 0.223
# 0.375

../data/test_cleaned_black_resized/wcjoxp-0.png
(6, 'wcjoxp')


In [None]:

import os
import multiprocessing
from tqdm import tqdm

def process_image(image):
    """Function to process a single image and return (correct, total)."""
    path = os.path.join(TEST_FOLDER, image)
    boxes, guess = read_captcha_from_image(path)
    correct_answer = image.split('-')[0]
    return (1, 1) if guess == correct_answer else (0, 1)

if __name__ == "__main__":
    images = os.listdir(TEST_FOLDER)

    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        results = list(tqdm(pool.imap(process_image, images), total=len(images)))

    correct = sum(result[0] for result in results)
    total = sum(result[1] for result in results)

    print("Accuracy:", correct / total)


    

  0%|          | 0/1968 [00:00<?, ?it/s]Process SpawnPoolWorker-8:
Process SpawnPoolWorker-2:
Process SpawnPoolWorker-6:
Process SpawnPoolWorker-1:
Process SpawnPoolWorker-7:
Process SpawnPoolWorker-4:
Process SpawnPoolWorker-3:
Process SpawnPoolWorker-5:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/seelengxd/.local/share/uv/python/cpython-3.12.8-macos-aarch64-none/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/seelengxd/.local/share/uv/python/cpython-3.12.8-macos-aarch64-none/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/seelengxd/.local/share/uv/python/cpython-3.12.8-macos-aarch64-none/lib/python3.12/multiprocessi