In [5]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from torchvision import datasets, transforms
from torch.utils.data import Dataset
from torchvision.io import read_image
from torch.utils.data import random_split
import torch.nn.functional as F
import torchvision.transforms as transforms
from torchvision.transforms import ToTensor, Normalize, Compose, Resize
import os
from PIL import Image


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import os

def get_all_labels(img_dir):
    # List all files in the directory
    filenames = os.listdir(img_dir)
    
    # Extract labels from filenames
    labels = set()
    for filename in filenames:
        name = os.path.splitext(filename)[0]  # remove the extension
        labels.update(list(name))  # add each character in the filename to the set of labels
    
    return labels

img_dir = 'images'  # replace with your directory path
all_labels = get_all_labels(img_dir)

print(f"Unique labels: {all_labels}")


Unique labels: {'g', 'b', 'M', 'C', 'm', 'Q', 'y', 'S', 'q', 'E', 'B', 'x', 'v', 't', 'P', 'I', '9', 'R', 'p', 'e', 'X', 'h', 'r', '2', 'Y', 'Z', 'V', '5', 's', '3', '7', 'a', 'K', 'd', '8', 'k', 'U', 'i', '1', 'H', 'n', 'O', 'J', 'A', 'W', 'G', '4', 'j', 'F', 'D', 'T', 'z', 'L', 'f', 'u', '6', 'c', 'w', 'N', 'l'}


In [7]:
len(all_labels)

60

In [8]:
class CustomImageDataset(Dataset):
    def __init__(self, img_dir, transform=None):
        self.img_dir = img_dir
        self.img_names = os.listdir(img_dir)
        self.transform = transform
        self.label_mapping = {label: idx for idx, label in enumerate(all_labels)}

    def __len__(self):
        return len(self.img_names)

    def filename_to_labels(self, filename):
        # Remove the file extension
        name = os.path.splitext(filename)[0]
        # Create a binary label vector for each character in the filename
        labels = torch.zeros(len(self.label_mapping) * 5)
        for char in name:
            if char in self.label_mapping:
                labels[self.label_mapping[char]*5 : (self.label_mapping[char]+1)*5] = 1
        return labels


    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.img_names[idx])
        image = Image.open(img_path)

        # Convert the filename to labels
        labels = self.filename_to_labels(self.img_names[idx])

        if self.transform:
            image = self.transform(image)
            
            
        return image, labels


In [9]:
transform = Compose([
    Resize((150,40)),  # Resize the images to 150x40
    ToTensor(),  # Convert the image to a tensor
])

In [10]:
dataset = CustomImageDataset('images', transform= transform)

In [11]:
train_proportion = 0.8
train_size = int(len(dataset) * train_proportion)
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [12]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

In [13]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        # Input: 150x40x3
        self.conv1_1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
        self.conv1_2 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)

        # After pool1: 75x20x64
        self.conv2_1 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        
        # After pool2: 37x10x128
        self.fc1 = nn.Linear(37*10*128, 1024)  
        self.fc2 = nn.Linear(1024, 512)  
        self.fc3 = nn.Linear(512, 60*5)  # output layer

    def forward(self, x):
        x = F.relu(self.conv1_1(x))
        x = self.pool1(F.relu(self.conv1_2(x)))

        x = self.pool2(F.relu(self.conv2_1(x)))

        x = x.view(x.size(0), -1)  # flatten the tensor
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)  # we can apply a sigmoid here if we are using BCEWithLogitsLoss as the loss function
        return x


In [14]:
model = CNN().to(device)

In [15]:
class CustomMSELoss(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, input, target):
        # Reshape the inputs to be 2D tensors with 5 columns
        input = input.view(-1, 5)
        target = target.view(-1, 5)

        # Calculate the mean squared error for each 5-dimensional vector
        mse = (input - target).pow(2).mean(dim=1)

        # Return the average MSE over all vectors
        return mse.mean()


In [16]:
loss_fn = CustomMSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [17]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [18]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()

            # Convert the model's outputs into predictions
            pred = torch.sigmoid(pred)  # Apply the sigmoid function to convert the scores into probabilities
            pred = (pred > 0.5).float()  # Threshold the probabilities at 0.5 to get the predictions

            # Calculate the number of correct predictions
            correct += (pred == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size * 300  # Divide by the total number of labels
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")


In [19]:
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_loader, model, loss_fn, optimizer)
    test(test_loader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 0.081694  [   64/90449]
loss: 0.074096  [ 6464/90449]
loss: 0.075192  [12864/90449]
loss: 0.074049  [19264/90449]
loss: 0.074302  [25664/90449]
loss: 0.074539  [32064/90449]
loss: 0.074719  [38464/90449]
loss: 0.074638  [44864/90449]
loss: 0.074398  [51264/90449]
loss: 0.073975  [57664/90449]
loss: 0.072890  [64064/90449]
loss: 0.072880  [70464/90449]
loss: 0.073369  [76864/90449]
loss: 0.072812  [83264/90449]
loss: 0.071818  [89664/90449]
Test Error: 
 Accuracy: 8.2%, Avg loss: 0.073330 

Epoch 2
-------------------------------
loss: 0.072181  [   64/90449]
loss: 0.072649  [ 6464/90449]
loss: 0.073190  [12864/90449]
loss: 0.072822  [19264/90449]
loss: 0.074148  [25664/90449]
loss: 0.072498  [32064/90449]


In [None]:
torch.save(model.state_disct(), 'model.pth')

In [None]:
# Load and preprocess the image
img_path = 'images'  
image = Image.open(img_path)
image = transform(image)  # apply the same transformations as during training
image = image.unsqueeze(0)  # add an extra dimension for the batch size
image = image.to(device)  # move the image to the device where the model is

# Make a prediction
with torch.no_grad():  # disable gradient calculations
    model.eval()  # put the model in evaluation mode
    output = model(image)  # get the raw output

# Postprocess the output
prob = torch.sigmoid(output)  # convert the raw output into probabilities
pred = (prob > 0.5).float()  # threshold the probabilities at 0.5 to get the predictions


NameError: name 'Image' is not defined