In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms, models
from PIL import Image
import sys

In [2]:
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.25, 0.25, 0.25])
])

In [3]:
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_built() and torch.backends.mps.is_available():
    device = torch.device("mps")
print(device)

cuda


In [4]:
CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
NUM_CLASSES = 38 # 37 + 1 blank
CTC_LABELS = [
    "<BLANK>", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "-", 
    "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", 
    "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"
]
COCO_TO_CTC = {
    1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10,
    11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16,
    18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24,
    26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 32: 31, 33: 32,
    34: 33, 35: 34, 36: 35, 37: 36, 38: 37
}

In [5]:
from torchvision.models import resnet50, ResNet50_Weights

class plate_OCR(nn.Module):
    def __init__(self):
        super().__init__()
        resnet = models.resnet50(weights=ResNet50_Weights.DEFAULT)
        modules = list(resnet.children())[:-2]
        self.features = nn.Sequential( #bruh it was pooling too much
            resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool,
            resnet.layer1, resnet.layer2
        ) 
        #self.dimension_reduction = nn.Linear(1024, 512)
        self.rnn = nn.LSTM(input_size=512, hidden_size=128, num_layers=2, bidirectional=True, batch_first=True)
        self.classify = nn.Linear(128 * 2, NUM_CLASSES) 
        self.height_conv = nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(16, 1), stride=(1, 1), padding=0, bias=False)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        #print(x.shape)
        x = self.features(x) 
        x = self.relu(self.height_conv(x)) 
        x = x.squeeze(2) 
        x = x.permute(0, 2, 1) 
        #print(x.shape)
        #x = self.dimension_reduction(x)
        #print(x.shape)
        rnn, _ = self.rnn(x)
        result = self.classify(rnn)
        return result

In [6]:
def decode(result):
    pred = result.argmax(-1).squeeze(0).tolist()
    prev = -1
    output = []
    for p in pred:
        if p != prev and p != len(CHARS):
            output.append(CHARS[p])
        prev = p
    return ''.join(output)

In [7]:
from number_coco import license_coco
from number_coco import license_collate
from torch.utils.data import DataLoader
''' for some reason this line wouldn't work here so reput at the top
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((128, 128)),
])
'''
X_train = "../data/license_numbers/train/images"
y_train = "../data/license_numbers/train/annotations.json"

train_dataset = license_coco(root=X_train, ann_file=y_train, transforms=transform)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=4, prefetch_factor=2, persistent_workers=True, collate_fn=license_collate)

In [8]:
model = plate_OCR().to(device)
ctc_loss = nn.CTCLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [10]:
testing_train_loader = iter(train_loader)

In [60]:
model.train()
images, targets, target_lengths = next(testing_train_loader)  # adjust to your loader
images = images.to(device)
targets = targets.to(device)
target_lengths = target_lengths.to(device)

optimizer.zero_grad()

outputs = model(images) # [B, T, C]
log_probability = F.log_softmax(outputs, dim=2)
log_probability = log_probability.permute(1, 0, 2) # [T, B, C]

# Input lengths: full length for each sequence
input_lengths = torch.full(
    size=(outputs.size(0),),       # B
    fill_value=outputs.size(1),    # T
    dtype=torch.long
).to(device)

assert all(input_lengths >= target_lengths), "Target sequence too long for CTC"

# Loss computation
loss = ctc_loss(log_probability, targets, input_lengths, target_lengths)
print(f"Loss: {loss.item():.4f}")

# Backward and optimize
loss.backward()
optimizer.step()

Loss: 5.2969


In [62]:
#testing one cycle
model.eval()
images, targets, target_lengths = next(testing_train_loader)

images = images.to(device)
targets = targets.to(device)
target_lengths = target_lengths.to(device)

with torch.no_grad():
    output = model(images)
print(output[0])
print(targets[:target_lengths[0]])
print(output[0].shape)
print(output[0].argmax(dim=1))

tensor([[ 2.0919e+00, -2.8863e-01,  5.2536e-02, -1.6744e-01, -6.1107e-01,
          1.2277e-01, -2.6832e-01, -8.1728e-02, -1.7669e-02,  1.3442e-01,
         -1.2289e-01, -1.0407e-01, -3.2711e-01, -5.0249e-01, -6.6734e-02,
         -2.2979e-01, -3.0328e-01,  5.3204e-02, -3.1642e-01, -2.0495e-01,
         -1.8913e-01, -2.3046e-01, -1.9388e-01,  5.3737e-02, -5.8599e-02,
         -3.4637e-01, -3.9886e-02, -4.8267e-02, -1.3595e-01, -2.2445e-01,
         -3.5550e-01, -1.7549e-01, -9.6386e-02, -3.0394e-01, -1.4007e-01,
         -2.1390e-01, -1.4975e-01, -2.5941e-01],
        [ 2.5701e+00, -3.5723e-01,  9.5379e-02, -1.3914e-01, -6.8035e-01,
          1.2131e-01, -2.8729e-01, -9.4315e-02, -1.3110e-02,  2.2055e-01,
         -1.9837e-01, -6.4635e-02, -3.3668e-01, -5.9638e-01, -7.7981e-02,
         -2.9158e-01, -3.8386e-01,  1.3086e-01, -3.0603e-01, -2.2441e-01,
         -3.4260e-01, -3.1012e-01, -2.4930e-01,  2.1154e-02, -5.2714e-02,
         -4.2790e-01, -4.8737e-02, -2.2328e-02, -1.2636e-01, -3

In [None]:
#testing one cycle
model.train()
images, targets, target_lengths = next(testing_train_loader)

images = images.to(device)
targets = targets.to(device)
target_lengths = target_lengths.to(device)
print(target_lengths)

optimizer.zero_grad()
outputs = model(images) # [B, T, C]
print(outputs.shape)
print(outputs)
log_probability = outputs.permute(1, 0, 2) # [T, B, C]
print(log_probability.shape)

log_probability = F.log_softmax(log_probability, dim=2)

# Input lengths: full length for each sequence
input_lengths = torch.full(
    size=(outputs.size(0),),       # B
    fill_value=outputs.size(1),    # T
    dtype=torch.long
).to(device)
print(input_lengths)
print(target_lengths)
assert all(input_lengths >= target_lengths), "Target sequence too long for CTC"

# Loss computation
loss = ctc_loss(log_probability, targets, input_lengths, target_lengths)
print(f"Loss: {loss.item():.4f}")

# Backward and optimize
loss.backward()
optimizer.step()

tensor([7, 0, 8, 6], device='cuda:0')
torch.Size([4, 16, 38])
tensor([[[-0.1104,  0.0869,  0.0212,  ..., -0.0262, -0.0195, -0.0979],
         [-0.1081,  0.0991,  0.0240,  ..., -0.0231, -0.0275, -0.0977],
         [-0.1203,  0.0975,  0.0440,  ..., -0.0397, -0.0223, -0.0958],
         ...,
         [-0.1156,  0.0892,  0.0679,  ..., -0.0024, -0.0121, -0.1438],
         [-0.0961,  0.0845,  0.0551,  ...,  0.0138, -0.0075, -0.1401],
         [-0.0660,  0.0660,  0.0314,  ...,  0.0603, -0.0014, -0.1513]],

        [[-0.1094,  0.0639,  0.0084,  ...,  0.0041, -0.0238, -0.0621],
         [-0.1015,  0.0572,  0.0246,  ...,  0.0041, -0.0236, -0.0793],
         [-0.1047,  0.0594,  0.0490,  ...,  0.0114, -0.0097, -0.0903],
         ...,
         [-0.0730,  0.1294,  0.0465,  ...,  0.0418,  0.0877, -0.1646],
         [-0.0652,  0.1371,  0.0137,  ...,  0.0632,  0.0934, -0.1779],
         [-0.0556,  0.1084, -0.0020,  ...,  0.1112,  0.0581, -0.1776]],

        [[-0.1094,  0.0972,  0.0148,  ..., -0.0086, -0

In [11]:
model = plate_OCR().to(device)
ctc_loss = nn.CTCLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

epochs = 5
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    epoch_loss   = 0.0
    i = 0

    for images, targets, target_lengths in train_loader:
        images = images.to(device)
        targets = targets.to(device)
        target_lengths = target_lengths.to(device)
        #print(target_lengths)

        optimizer.zero_grad()
        outputs = model(images) # [B, T, C]
        log_probability = F.log_softmax(outputs, dim=2)
        log_probability = log_probability.permute(1, 0, 2) # [T, B, C]

        # Input lengths: full length for each sequence
        input_lengths = torch.full(
            size=(outputs.size(0),),       # B
            fill_value=outputs.size(1),    # T
            dtype=torch.long
        ).to(device)
        
        if (target_lengths > input_lengths).any():
            # batch is impossible for CTC → skip
            print("Skipping batch: target length exceeds input length.")
            optimizer.zero_grad(set_to_none=True)  # cheap no-op
            continue    

        # Loss computation
        loss = ctc_loss(log_probability, targets, input_lengths, target_lengths)
        #print(f"Loss: {loss.item():.4f}")

        # Backward and optimize
        loss.backward()
        optimizer.step()
        
        loss_val = loss.item()
        running_loss += loss_val
        epoch_loss   += loss_val
        
        if i % 50 == 0 or i == len(train_loader):
            print(f"[{epoch+1}/{epochs}, {i:5d}] "
                f"loss/50 = {running_loss/50:.4f} | "
                f"LR = {optimizer.param_groups[0]['lr']:.2e}")
            print(targets[:target_lengths[0]])
            print(outputs[0].argmax(dim=1))
            running_loss = 0.0
        i += 1
        
    print(f"Epoch {epoch+1} finished - avg loss: {epoch_loss/len(train_loader):.4f}\n")

torch.save(model.state_dict(), "number_model.pth")


[1/5,     0] loss/50 = 0.1612 | LR = 1.00e-04
target_length: torch.Size([4]) input length: torch.Size([4])
tensor([34, 32, 14, 32, 20, 20, 21, 31], device='cuda:0')
tensor([11, 11, 11, 25, 11, 27, 11, 11, 11, 25, 25, 25, 25, 25, 25, 25],
       device='cuda:0')
[1/5,    50] loss/50 = 5.6547 | LR = 1.00e-04
target_length: torch.Size([4]) input length: torch.Size([4])
tensor([21, 26,  2, 32, 13, 21, 13], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
[1/5,   100] loss/50 = 4.0558 | LR = 1.00e-04
target_length: torch.Size([4]) input length: torch.Size([4])
tensor([ 9, 28, 32,  4, 14,  6, 22], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
[1/5,   150] loss/50 = nan | LR = 1.00e-04
target_length: torch.Size([4]) input length: torch.Size([4])
tensor([ 9,  9,  9, 29, 29, 29], device='cuda:0')
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
[1/5,   200] loss/50 = nan | LR = 1.00e-

KeyboardInterrupt: 

In [11]:
X_test = "../data/license_numbers/test/images"
y_test = "../data/license_numbers/test/annotations.json"

test_dataset = license_coco(root=X_test, ann_file=y_test, transforms=transform)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=True, num_workers=4, prefetch_factor=2, persistent_workers=True, collate_fn=license_collate)

In [28]:
images, targets, target_lengths = next(iter(train_loader))

images = images.to(device)
targets = targets.to(device)
target_lengths = target_lengths.to(device)
with torch.no_grad():
    output = model(images)
print(output)

tensor([[[-6.7804, -6.6093, -4.1751, -2.9140, -4.0386, -3.4436, -3.6946,
          -3.2153, -5.9917, -8.0706, -4.5897, -3.7022, -3.5820, -2.5212,
          -5.4768, -3.2019, -4.5439, -2.7529, -4.1720, -3.4432, -5.2258,
          -2.1244, -3.5974, -3.7048, -6.8515, -3.9357, -4.8088, -3.9109,
          -4.7454, -6.6749, -5.1487, -3.6975, -5.0427, -4.3240, -3.9655,
          -4.4039, -1.5948, -6.9093, -4.5697, -7.2452],
         [-7.1760, -6.7530, -4.0604, -3.7520, -4.2617, -3.0541, -3.8179,
          -3.6107, -5.9244, -8.2178, -4.4114, -3.3120, -3.8766, -2.7352,
          -5.4026, -2.8435, -4.1236, -3.5076, -4.1026, -3.3576, -4.9164,
          -2.4316, -3.5425, -3.8740, -7.1301, -4.0793, -5.2137, -4.6204,
          -4.9785, -6.9158, -4.4501, -2.9742, -5.0943, -4.3976, -3.4198,
          -5.0700, -1.8349, -7.1707, -4.4202, -7.2544],
         [-7.4924, -7.0548, -4.5867, -4.2835, -4.3063, -3.0697, -4.1318,
          -4.0297, -5.9549, -8.4112, -4.2952, -3.6074, -4.3075, -3.0530,
          -5