# **Breaking CAPTCHAs with PyTorch**

In [12]:
# 1. Imports
import os, glob
import numpy as np
import cv2

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from lab_2_helpers import *

In [13]:
# 2. Extract dataset
!tar -xf captcha-images.tar.xz
!ls

captcha-images	       lab_2_helpers.py  sample_data
captcha-images.tar.xz  __pycache__


In [14]:
# 3. Core preprocessing functions
def load_transform_image(image_path: str):
    """Load a CAPTCHA image as grayscale with an 8 px replicated border.

    Args:
        image_path: path of the image file to read.

    Returns:
        The grayscale image, padded by 8 pixels on every side using
        edge replication.

    Raises:
        FileNotFoundError: if OpenCV returns no image for ``image_path``.
    """
    bgr = cv2.imread(image_path)
    if bgr is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    pad = 8  # same border width on top, bottom, left and right
    return cv2.copyMakeBorder(
        cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY),
        pad, pad, pad, pad,
        cv2.BORDER_REPLICATE,
    )

def extract_captcha_text(image_path: str) -> str:
    """Return the file name of ``image_path`` without directory or extension.

    The CAPTCHA solution is encoded in the file name, so the stem doubles as
    the ground-truth label for the image.
    """
    _directory, filename = os.path.split(image_path)
    stem, _extension = os.path.splitext(filename)
    return stem

def extract_chars(gray_image):
    """Segment a grayscale CAPTCHA into its four character crops.

    The image is binarised with an inverted Otsu threshold, external contours
    are located, and each contour's bounding box is treated as one character.
    A box whose width/height ratio exceeds 1.25 is assumed to hold two
    touching characters and is split into two equal halves.

    Args:
        gray_image: single-channel grayscale image array.

    Returns:
        A list of four crops of ``gray_image`` ordered left to right, each
        including up to a 2 px margin around its bounding box, or ``None``
        when the number of detected regions is not exactly four.
    """
    # Threshold image and convert it to black-white
    image_bw = cv2.threshold(
        gray_image, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU
    )[1]
    # Find contours (continuous blobs of pixels) in the image.
    # findContours returns (contours, hierarchy) on OpenCV 4.x but
    # (image, contours, hierarchy) on 3.x; index -2 is the contour list in both.
    contours = cv2.findContours(
        image_bw, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )[-2]

    char_regions = []
    for contour in contours:
        # Get the rectangle that contains the contour
        x, y, w, h = cv2.boundingRect(contour)

        if w / h > 1.25:
            # Bounding box is too wide for a single character:
            # split it in half into two letter regions
            half_width = int(w / 2)
            char_regions.append((x, y, half_width, h))
            char_regions.append((x + half_width, y, half_width, h))
        else:
            # Only a single letter in contour
            char_regions.append((x, y, w, h))

    # Ignore image if less or more than 4 regions detected
    if len(char_regions) != 4:
        return None

    # Sort regions by their X coordinates (left to right)
    char_regions.sort(key=lambda region: region[0])

    margin = 2
    char_images = []
    for x, y, w, h in char_regions:
        # Clamp the start of the crop at 0: a negative start index would make
        # NumPy count from the far edge and yield an empty or wrong crop.
        # The end index may exceed the image size; slicing truncates it.
        top = max(y - margin, 0)
        left = max(x - margin, 0)
        char_images.append(gray_image[top:y + h + margin, left:x + w + margin])

    return char_images

def make_feature(char_image, width=20, height=20):
    """Turn one character crop into a normalised, channel-last feature array.

    The crop is passed through ``resize_to_fit`` (not defined in this file;
    presumably provided by ``lab_2_helpers``), converted to float32, divided
    by 255 and given a trailing axis of size 1.

    Args:
        char_image: character crop to convert.
        width: target width forwarded to ``resize_to_fit``.
        height: target height forwarded to ``resize_to_fit``.

    Returns:
        float32 array with one extra trailing axis.
    """
    fitted = resize_to_fit(char_image, width, height)
    return np.expand_dims(fitted.astype("float32") / 255.0, axis=-1)

In [15]:
# 4. individual character dataset from CAPTCHAs
# Collect every PNG in the dataset folder; sorting makes the order reproducible.
image_paths = sorted(glob.glob("./captcha-images/*.png"))
print("Found images:", len(image_paths))

# Per-character features and labels (one entry per segmented character).
X_chars = []
y_chars = []
# Whole-CAPTCHA images and their texts (one entry per file, including files
# whose segmentation is rejected below).
captcha_images = []
captcha_texts = []

for path in image_paths:
    gray = load_transform_image(path)
    text = extract_captcha_text(path)

    # keep original CAPTCHA image for later visualization/eval
    captcha_images.append(gray)
    captcha_texts.append(text)

    # segment characters
    chars = extract_chars(gray)
    # Skip CAPTCHAs that could not be split into exactly four characters.
    if chars is None or len(chars) != 4:
        continue

    # build per-character samples
    # NOTE(review): zip stops at the shorter input, so a file name that is not
    # exactly four characters long is truncated silently - confirm the data
    # set guarantees 4-character names.
    for char_img, char_label in zip(chars, text):
        X_chars.append(make_feature(char_img))
        y_chars.append(char_label)

# Stack the Python lists into arrays for splitting and training.
X_chars = np.array(X_chars)
y_chars = np.array(y_chars)
print("Character samples:", X_chars.shape, y_chars.shape)

Found images: 1136
Character samples: (4468, 20, 20, 1) (4468,)


In [16]:
# 5. train/test split, label encoding, transpose
# Hold out 20% of the character samples; the split is stratified on the
# character label and seeded so it is reproducible.
X_train_np, X_test_np, y_train_text, y_test_text = train_test_split(
    X_chars, y_chars, test_size=0.2, random_state=42, stratify=y_chars
)

# Map character labels to integer class ids. The encoder is fitted on the
# training labels only and then reused for the test labels.
le = LabelEncoder()
y_train = le.fit_transform(y_train_text)
y_test  = le.transform(y_test_text)

num_classes = len(le.classes_)
print("num_classes:", num_classes)

# NHWC -> NCHW
# Move the trailing channel axis to position 1, the layout nn.Conv2d expects.
X_train = np.transpose(X_train_np, (0, 3, 1, 2)).astype(np.float32)
X_test  = np.transpose(X_test_np,  (0, 3, 1, 2)).astype(np.float32)

num_classes: 32


In [17]:
# 6. dataset + dataloader
class CaptchaCharDataset(Dataset):
    """Map-style dataset pairing character images with integer class labels."""

    def __init__(self, X, y):
        """Wrap NumPy arrays ``X`` (samples) and ``y`` (labels) as tensors.

        Labels are converted to int64.
        """
        self.X = torch.from_numpy(X)
        labels = torch.from_numpy(y)
        self.y = labels.to(torch.int64)

    def __len__(self):
        """Return the number of labelled samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at ``idx``."""
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

# Mini-batch size shared by both loaders.
batch_size = 64
# Training batches are reshuffled; test batches keep a fixed order.
train_loader = DataLoader(CaptchaCharDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(CaptchaCharDataset(X_test,  y_test),  batch_size=batch_size, shuffle=False)

In [18]:
# 7. PyTorch CNN
class SimpleCNN(nn.Module):
    """Two-stage convolutional classifier for single-channel character images.

    Each stage is a 5x5 convolution (padding 2, so spatial size is kept),
    a ReLU and a 2x2 max-pool. The classifier head flattens the 16 feature
    maps and maps them through a 64-unit hidden layer to ``num_classes``
    logits. The head's input size of 16 * 5 * 5 matches 20x20 inputs, which
    the two pooling steps reduce to 5x5.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Build the conv stages in order so layer indices stay 0..5.
        conv_layers = []
        for in_channels, out_channels in ((1, 8), (8, 16)):
            conv_layers += [
                nn.Conv2d(in_channels, out_channels, kernel_size=5, padding=2),
                nn.ReLU(),
                nn.MaxPool2d(2),
            ]
        self.features = nn.Sequential(*conv_layers)

        flat_features = 16 * 5 * 5
        head_layers = [
            nn.Flatten(),
            nn.Linear(flat_features, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return unnormalised class logits for the batch ``x``."""
        return self.classifier(self.features(x))

In [19]:
# 8. train + eval loops
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleCNN(num_classes).to(device)

# Cross-entropy over the class logits, optimised with Adam at lr 1e-3.
# run_epoch below reads device, criterion and optimizer as module-level names.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def run_epoch(model, loader, train=True):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    Reads the module-level ``device``, ``criterion`` and ``optimizer``.

    Args:
        model: network to train or evaluate.
        loader: iterable yielding ``(inputs, targets)`` batches.
        train: when true, the model is put in training mode and the optimizer
            takes one step per batch; otherwise the model is put in eval mode
            and gradient tracking is switched off for the whole pass.

    Returns:
        Tuple of the sample-weighted mean loss and the fraction of correctly
        classified samples.

    Raises:
        ZeroDivisionError: if ``loader`` yields no samples.
    """
    if train:
        model.train()
    else:
        model.eval()

    total_loss, total_correct, total = 0.0, 0, 0

    # Evaluation needs no autograd graph; disabling it saves memory and time.
    with torch.set_grad_enabled(bool(train)):
        for Xb, yb in loader:
            Xb, yb = Xb.to(device), yb.to(device)

            if train:
                optimizer.zero_grad()

            logits = model(Xb)
            loss = criterion(logits, yb)

            if train:
                loss.backward()
                optimizer.step()

            # criterion returns the batch mean, so weight it by batch size
            # to get a correct per-sample average over uneven batches.
            batch_count = Xb.size(0)
            total_loss += loss.item() * batch_count
            total_correct += (logits.argmax(dim=1) == yb).sum().item()
            total += batch_count

    return total_loss / total, total_correct / total

# Train for a fixed number of epochs, evaluating on the test loader after
# each one and printing loss/accuracy for both passes.
epochs = 10
for e in range(1, epochs + 1):
    tr_loss, tr_acc = run_epoch(model, train_loader, train=True)
    te_loss, te_acc = run_epoch(model, test_loader,  train=False)
    print(f"Epoch {e:02d} | train {tr_loss:.4f} acc {tr_acc:.4f} | test {te_loss:.4f} acc {te_acc:.4f}")

Epoch 01 | train 3.3517 acc 0.0853 | test 3.0606 acc 0.1544
Epoch 02 | train 2.0390 acc 0.4597 | test 0.9971 acc 0.7047
Epoch 03 | train 0.6520 acc 0.8391 | test 0.4496 acc 0.9027
Epoch 04 | train 0.3218 acc 0.9379 | test 0.2663 acc 0.9463
Epoch 05 | train 0.2002 acc 0.9670 | test 0.1741 acc 0.9709
Epoch 06 | train 0.1438 acc 0.9751 | test 0.1465 acc 0.9743
Epoch 07 | train 0.1062 acc 0.9824 | test 0.1133 acc 0.9799
Epoch 08 | train 0.0783 acc 0.9874 | test 0.0988 acc 0.9821
Epoch 09 | train 0.0642 acc 0.9888 | test 0.0906 acc 0.9821
Epoch 10 | train 0.0508 acc 0.9899 | test 0.0892 acc 0.9776
