In [8]:
import os
import string

import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

In [9]:
CAPTCHA_PATH = './captcha_dataset'
PATH = './per_captcha_model.pth'

list_of_characters = list(string.digits) + list(string.ascii_lowercase)
nchar = len(list_of_characters)
train_pct = 0.8
training_epochs = 500

In [10]:
captchas = os.listdir(CAPTCHA_PATH)
solutions = [file[:5] for file in captchas]
X = np.zeros((len(captchas), 50, 200, 1))

for i, img in enumerate(captchas):
    image = plt.imread(f"{CAPTCHA_PATH}/{img}")

    array_image = np.array(image) / 255
    array_image = np.reshape(array_image[:, :, 0], (50, 200, 1))
    X[i] = (array_image)

In [11]:
y = np.zeros((5 , len(captchas), nchar))
for i, captcha in enumerate(solutions):
   
    temp_label = np.zeros((5, nchar))

    for j, character in enumerate(captcha):
        character_index = list_of_characters.index(character)

        temp_label[j, character_index] = 1
        
    y[:, i, :] = temp_label

In [12]:
class CaptchaModel(nn.Module):
    def __init__(self, nchar):
        super(CaptchaModel, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.mp1 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.mp2 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=3, padding=1)
        self.bn = nn.BatchNorm2d(32)
        self.mp3 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        
        self.fc_shared = nn.Linear(4800, 64)
        self.fc_outputs = nn.ModuleList([nn.Linear(64, nchar) for _ in range(5)])

    def forward(self, x):
        x = self.conv1(x)
        x = nn.functional.relu(x)
        x = self.mp1(x)
        
        x = self.conv2(x)
        x = nn.functional.relu(x)
        x = self.mp2(x)
        
        x = self.conv3(x)
        x = nn.functional.relu(x)
        x = self.bn(x)
        x = self.mp3(x)

        x = x.view(x.size(0), -1)
        
        x = self.fc_shared(x)
        x = nn.functional.relu(x)
        
        outputs = [output_layer(x) for output_layer in self.fc_outputs]
        return outputs


model = CaptchaModel(nchar)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [13]:
cut = int(train_pct * len(captchas))

# Convert your data and labels to PyTorch tensors
X_tensor = torch.FloatTensor(X.transpose(0, 3, 1, 2))
y_tensor = torch.LongTensor(y) 

# Split the data into training and validation sets
X_train = X_tensor[:cut]
y_train = y_tensor[:, :cut, :]

X_val = X_tensor[cut:]
y_val = y_tensor[:, cut:, :]

train_dataset = TensorDataset(
    X_train,
    y_train[0],
    y_train[1],
    y_train[2],
    y_train[3],
    y_train[4]
)

test_dataset = TensorDataset(
    X_val,
    y_val[0],
    y_val[1],
    y_val[2],
    y_val[3],
    y_val[4]
)

train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=False
)

test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False
)

In [14]:
for epoch in range(training_epochs):
    model.train()

    running_loss = 0.0
    # Iterate over batches of the training dataset
    for inputs, target0, target1, target2, target3, target4 in train_loader:
        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        
        # Calculate the loss for each output and sum them up
        targets = [target0, target1, target2, target3, target4]
        losses = [criterion(output, target.float()) for output, target in zip(outputs, targets)]
        loss = sum(losses)

        # Backpropagation
        loss.backward()
        
        # Update the model's parameters
        optimizer.step()
        
        running_loss += loss.item()
    
    # Print the average loss for this epoch every 10 epochs
    if (epoch+1) % 10 == 0:
        print(f"Epoch {epoch+1} - Loss: {running_loss / len(train_loader)}")

Epoch 10 - Loss: 14.39334124105948
Epoch 20 - Loss: 14.221042032595035
Epoch 30 - Loss: 14.052305398163972
Epoch 40 - Loss: 13.969622859248409
Epoch 50 - Loss: 13.995127360026041
Epoch 60 - Loss: 13.871580088580096
Epoch 70 - Loss: 13.79713973292598
Epoch 80 - Loss: 13.778606202867296


KeyboardInterrupt: 

### Saving trained network

In [None]:
torch.save(model.state_dict(), PATH)

### Loading trained network

In [None]:
model = CaptchaModel(nchar)
model.load_state_dict(torch.load(PATH))

In [None]:
# Gets predicted labels from test dataset
pred_labels = torch.zeros((len(X_val), 5, 1))

with torch.no_grad():
    model.eval()

    for i, input in enumerate(X_val):
        temp_labels = torch.zeros((5,1))

        outputs = model(input.unsqueeze(0))

        for j, num in enumerate(outputs):
            temp_labels[j] = num.argmax(dim=1)

        pred_labels[i] = temp_labels  

### Turn labels into indexes to compare to predicted

In [None]:
y_test_indexes = torch.zeros((214, 5, 1))

for i in range(214):
    temp_label = torch.zeros((5, 1))

    for j in range(5):
        itemindex = np.where(y_val[:,i][j] == 1 )[0][0]
        temp_label[j] = itemindex

    y_test_indexes[i] = temp_label

### Evaluating model accuracy

In [None]:
correct = 0

for i in range(y_val.shape[1]):
    a = y_test_indexes[i] == pred_labels[i]

    if False not in a:
        correct += 1

accuracy = correct / y_val.shape[1] * 100
print(f"Correct CAPTCHAs: {accuracy:.2f}%")

In [None]:
correct = 0
total_digits = 0

for i in range(y_val.shape[1]):
    for j in range(5):
        total_digits += 1
        if y_test_indexes[i][j] == pred_labels[i][j]:
            correct += 1

accuracy = correct / total_digits * 100
print(f"Correct digits: {accuracy:.2f}%")