In [1]:
import os
import torch
import cv2
import bounding_box
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class LatexDataset(Dataset):
    def __init__(self, image_dir, label_dir, transform=None):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.transform = transform
        self.image_files = [f for f in os.listdir(image_dir) if f.endswith(".png")]

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_name = self.image_files[idx]
        img_path = os.path.join(self.image_dir, img_name)
        label_path = os.path.join(self.label_dir, img_name.replace(".png", ".txt"))

        # Load image
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        h, w, _ = image.shape  # Original image size

        # Load label
        with open(label_path, "r") as f:
            label_data = [list(map(float, line.split())) for line in f.readlines()]
        
        class_labels = []
        bboxes = []

        for label in label_data:
            class_id = int(label[0])
            x_center, y_center, width, height = label[1:]

            x_min = (x_center - width / 2) 
            y_min = (y_center - height / 2)
            x_max = (x_center + width / 2) 
            y_max = (y_center + height / 2)

            class_labels.append(class_id)
            bboxes.append([x_min, y_min, x_max, y_max])

        # Convert to tensors
        image = cv2.resize(image, (128, 128)) / 255.0 
        image = torch.tensor(image, dtype=torch.float32).permute(2, 0, 1)  
        class_labels = torch.tensor(class_labels, dtype=torch.long)
        bboxes = torch.tensor(bboxes, dtype=torch.float32)

        return image, (class_labels, bboxes)

In [2]:
# Load dataset
image_dir = "dataset"
label_dir = "dataset"
train_dataset = LatexDataset(image_dir, label_dir, transform=None)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [3]:
class ObjectDetectorSimpleCNN(nn.Module):
    def __init__(self, num_classes):
        super(ObjectDetectorSimpleCNN, self).__init__()

        # Convolutional layers
        self.conv1 = nn.Conv2d(3, 32, 3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)

        # Fully connected layers
        self.fc1 = nn.Linear(64 * 32 * 32, 128)

        # Class prediction head
        self.fc_class = nn.Linear(128, num_classes)

        # Bounding Box prediction head (x_min, y_min, x_max, y_max)
        self.fc_bbox = nn.Linear(128, 4)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(x.shape[0], -1)  # Flatten

        x = F.relu(self.fc1(x))

        class_output = self.fc_class(x)  # Classification
        bbox_output = self.fc_bbox(x)  # Bounding Box Regression

        return class_output, bbox_output


model = ObjectDetectorSimpleCNN(len(bounding_box.types))
model

ObjectDetectorSimpleCNN(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=65536, out_features=128, bias=True)
  (fc_class): Linear(in_features=128, out_features=10, bias=True)
  (fc_bbox): Linear(in_features=128, out_features=4, bias=True)
)

In [4]:
class_loss_fn = nn.CrossEntropyLoss()
bbox_loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_class_loss, running_bbox_loss = 0.0, 0.0

    for images, (labels, bboxes) in train_loader:
        images, labels, bboxes = images.to(device), labels.to(device), bboxes.to(device)
        
        optimizer.zero_grad()
        class_preds, bbox_preds = model(images)

        loss_class = class_loss_fn(class_preds, labels)
        loss_bbox = bbox_loss_fn(bbox_preds, bboxes)

        loss = loss_class + loss_bbox  # Total loss
        loss.backward()
        optimizer.step()

        running_class_loss += loss_class.item()
        running_bbox_loss += loss_bbox.item()

    print(f"Epoch {epoch+1}, Class Loss: {running_class_loss:.4f}, BBox Loss: {running_bbox_loss:.4f}")

print("Training complete!")

RuntimeError: stack expects each tensor to be equal size, but got [4] at entry 0 and [3] at entry 2