# Captcha Image Recognition

In [35]:
import glob
import os
from collections import Counter
from pathlib import Path

import albumentations
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from PIL import ImageFile
from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing

# Pillow reads the plural flag name. The singular spelling only creates an
# unused attribute, so truncated files would still raise when loaded.
ImageFile.LOAD_TRUNCATED_IMAGES = True

## Data

In [2]:
data_dir = Path("./data/")

# Every PNG directly under the data directory, as sorted path strings.
images = sorted(str(png_path) for png_path in data_dir.glob("*.png"))

# A captcha's label is its file name up to the first ".png".
labels = [
    image_path.rsplit(os.path.sep, 1)[-1].partition(".png")[0]
    for image_path in images
]

# Alphabet: each distinct character that occurs in any label.
characters = set("".join(labels))

print("Number of images found: ", len(images))
print("Number of labels found: ", len(labels))
print("Number of unique characters: ", len(characters))
print("Characters present: ", characters)

Number of images found:  1040
Number of labels found:  1040
Number of unique characters:  19
Characters present:  {'b', 'g', 'y', 'f', '3', '2', 'e', '8', '4', 'm', 'w', 'p', 'd', 'x', '7', '5', '6', 'n', 'c'}


## Configurations 

In [16]:
# Images are resized to (image_height, image_width). CaptchaModel's first
# linear layer takes 1152 = 64 channels x 18 rows, and two 2x2 max-pools
# reduce a 75-pixel-high input to exactly 18 rows, so the captcha has to be
# 75 high and 300 wide (the two values were previously swapped).
image_height = 75
image_width = 300
number_workers = 8  # DataLoader worker processes
batch_size = 8
epochs = 200
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Preprocessing

In [31]:
class ClassificationDataset:
    """Indexable dataset that loads an image from disk and pairs it with its target.

    Args:
        image_paths: indexable collection of image file paths.
        targets: indexable collection of targets, aligned with ``image_paths``.
        resize: optional ``(height, width)``; when given, each image is
            resized to that size with bilinear resampling.
    """

    def __init__(self, image_paths, targets, resize=None):
        self.image_paths = image_paths
        self.targets = targets
        self.resize = resize

        # Per-channel mean / std used to normalise 0-255 RGB pixel values.
        channel_mean = (0.485, 0.456, 0.406)
        channel_std = (0.229, 0.224, 0.225)
        normalize = albumentations.Normalize(
            channel_mean, channel_std, max_pixel_value=255.0,
        )
        self.aug = albumentations.Compose([normalize])

    def __len__(self):
        """Return the number of image paths held by the dataset."""
        return len(self.image_paths)

    def __getitem__(self, item):
        """Load sample ``item`` and return ``{"images": ..., "targets": ...}``.

        ``"images"`` is a float tensor with the channel axis first and
        ``"targets"`` is a long tensor built from ``self.targets[item]``.
        """
        picture = Image.open(self.image_paths[item]).convert("RGB")
        label = self.targets[item]

        if self.resize is not None:
            new_height = self.resize[0]
            new_width = self.resize[1]
            # PIL takes (width, height), the reverse of our (height, width).
            picture = picture.resize(
                (new_width, new_height), resample=Image.BILINEAR
            )

        normalised = self.aug(image=np.array(picture))["image"]
        # Move the channel axis to the front: (H, W, C) -> (C, H, W).
        channels_first = np.transpose(normalised, (2, 0, 1)).astype(np.float32)

        sample = {
            "images": torch.tensor(channels_first, dtype=torch.float),
            "targets": torch.tensor(label, dtype=torch.long),
        }
        return sample

## Model

In [36]:
class CaptchaModel(nn.Module):
    """Convolutional + bidirectional-GRU sequence model trained with CTC loss.

    Two conv/max-pool stages shrink the image, each remaining image column
    becomes one time step, and a two-layer bidirectional GRU followed by a
    linear layer scores ``num_chars + 1`` classes per step (index 0 is the
    CTC blank).

    Args:
        num_chars: number of distinct characters; targets are expected to use
            the indices ``1..num_chars`` because 0 is reserved for the blank.
    """

    def __init__(self, num_chars):
        super().__init__()
        self.conv_1 = nn.Conv2d(3, 128, kernel_size=(3, 6), padding=(1, 1))
        self.pool_1 = nn.MaxPool2d(kernel_size=(2, 2))
        self.conv_2 = nn.Conv2d(128, 64, kernel_size=(3, 6), padding=(1, 1))
        self.pool_2 = nn.MaxPool2d(kernel_size=(2, 2))
        # 1152 = 64 channels * 18 rows: the input height must pool down to 18
        # rows (e.g. a 75-pixel-high image) for this layer to fit.
        self.linear_1 = nn.Linear(1152, 64)
        self.drop_1 = nn.Dropout(0.2)
        # forward() feeds (batch, time, features), so the GRU must be
        # batch_first; otherwise the recurrence runs across the samples of a
        # batch instead of across the image columns. The attribute keeps its
        # historical name ``lstm`` so existing state dicts still load.
        self.lstm = nn.GRU(
            64, 32, bidirectional=True, num_layers=2, dropout=0.25,
            batch_first=True,
        )
        self.output = nn.Linear(64, num_chars + 1)

    def forward(self, images, targets=None):
        """Score every time step and optionally compute the CTC loss.

        Args:
            images: 4-D tensor ``(batch, 3, height, width)``.
            targets: optional 2-D integer tensor ``(batch, target_len)``;
                every row is treated as having the full ``target_len``.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape
            ``(time, batch, num_chars + 1)`` and ``loss`` is the CTC loss,
            or ``None`` when ``targets`` is not given.
        """
        bs, _, _, _ = images.size()
        x = F.relu(self.conv_1(images))
        x = self.pool_1(x)
        x = F.relu(self.conv_2(x))
        x = self.pool_2(x)
        # (batch, channels, height, width) -> (batch, width, channels, height),
        # then flatten channels*height so each column is one feature vector.
        x = x.permute(0, 3, 1, 2)
        x = x.view(bs, x.size(1), -1)
        x = F.relu(self.linear_1(x))
        x = self.drop_1(x)
        x, _ = self.lstm(x)
        x = self.output(x)
        # CTC expects (time, batch, classes).
        x = x.permute(1, 0, 2)

        if targets is None:
            return x, None

        log_probs = F.log_softmax(x, 2)
        # Every sample uses all time steps and the full target width.
        input_lengths = torch.full(
            size=(bs,), fill_value=log_probs.size(0), dtype=torch.int32
        )
        target_lengths = torch.full(
            size=(bs,), fill_value=targets.size(1), dtype=torch.int32
        )
        loss = nn.CTCLoss(blank=0)(
            log_probs, targets, input_lengths, target_lengths
        )
        return x, loss

In [41]:
# Build the network with one class per distinct captcha character; the model
# itself adds one extra output class (index 0) for the CTC blank.
model = CaptchaModel(num_chars=len(characters))
# Bare expression: as the last line of a notebook cell it displays the module.
model

CaptchaModel(
  (conv_1): Conv2d(3, 128, kernel_size=(3, 6), stride=(1, 1), padding=(1, 1))
  (pool_1): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (conv_2): Conv2d(128, 64, kernel_size=(3, 6), stride=(1, 1), padding=(1, 1))
  (pool_2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (linear_1): Linear(in_features=1152, out_features=64, bias=True)
  (drop_1): Dropout(p=0.2, inplace=False)
  (lstm): GRU(64, 32, num_layers=2, dropout=0.25, bidirectional=True)
  (output): Linear(in_features=64, out_features=20, bias=True)
)

# Train

In [32]:
def run_training():
    """Train ``CaptchaModel`` on the PNG captchas in ``data_dir`` and report metrics.

    Each image's label is its file name up to ``.png``. Characters are
    label-encoded and shifted by one so that class 0 stays reserved for the
    CTC blank used by ``CaptchaModel``. The data is split 90/10 into train
    and test sets, and after every epoch the train loss, test loss and
    accuracy are printed and the LR scheduler is stepped on the test loss.

    Uses the module-level settings ``data_dir``, ``image_height``,
    ``image_width``, ``number_workers``, ``batch_size``, ``epochs`` and
    ``dev``. It also calls ``engine.train_fn``, ``engine.eval_fn``,
    ``decode_predictions`` and ``remove_duplicates``, which are not defined
    in the visible part of this notebook and must be provided elsewhere.

    Returns:
        None.
    """
    image_files = sorted(map(str, data_dir.glob("*.png")))
    targets_orig = [
        path.split(os.path.sep)[-1].split(".png")[0] for path in image_files
    ]
    targets = [list(label) for label in targets_orig]  # per-image char lists
    targets_flat = [char for chars in targets for char in chars]

    lbl_enc = preprocessing.LabelEncoder()
    lbl_enc.fit(targets_flat)

    targets_enc = [lbl_enc.transform(chars) for chars in targets]
    # Shift labels to 1..N so that index 0 is free for the CTC blank. The
    # result must be bound to ``targets_enc`` for the shift to take effect.
    targets_enc = np.array(targets_enc) + 1

    (
        train_imgs,
        test_imgs,
        train_targets,
        test_targets,
        train_targets_orig,
        test_targets_orig,
    ) = model_selection.train_test_split(
        image_files, targets_enc, targets_orig, test_size=0.1, random_state=42
    )

    train_dataset = ClassificationDataset(
        image_paths=train_imgs,
        targets=train_targets,
        resize=(image_height, image_width),
    )
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        num_workers=number_workers,
        shuffle=True,
    )

    test_dataset = ClassificationDataset(
        image_paths=test_imgs,
        targets=test_targets,
        resize=(image_height, image_width),
    )
    # Keep test order fixed so predictions line up with test_targets_orig.
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=batch_size,
        num_workers=number_workers,
        shuffle=False,
    )

    model = CaptchaModel(num_chars=len(lbl_enc.classes_))
    # The configuration cell names the torch device ``dev``.
    model.to(dev)

    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.8, patience=5, verbose=True
    )
    for epoch in range(epochs):
        train_loss = engine.train_fn(model, train_loader, optimizer)
        valid_preds, test_loss = engine.eval_fn(model, test_loader)
        valid_captcha_preds = []
        for vp in valid_preds:
            current_preds = decode_predictions(vp, lbl_enc)
            valid_captcha_preds.extend(current_preds)
        combined = list(zip(test_targets_orig, valid_captcha_preds))
        print(combined[:10])
        test_dup_rem = [remove_duplicates(c) for c in test_targets_orig]
        accuracy = metrics.accuracy_score(test_dup_rem, valid_captcha_preds)
        print(
            f"Epoch={epoch}, Train Loss={train_loss}, Test Loss={test_loss} Accuracy={accuracy}"
        )
        scheduler.step(test_loss)
