In [1]:
import os, cv2, torch
import numpy as np
from torch.utils.data import Dataset

class MultiModalDataset(Dataset):
    """Dataset of aligned multi-modal images with one label file per sample.

    Directory layout, as implied by the paths built in this class::

        <root>/images/<split>/rgb/<id>.jpg
        <root>/images/<split>/<m>/<id>_<m>.jpg    # for every modality m != 'rgb'
        <root>/labels/<split>/<id>.txt            # whitespace-separated, 5 numbers per row

    Sample ids are discovered from the ``rgb`` directory only.

    Args:
        root: Dataset root directory.
        split: Sub-directory used under both ``images`` and ``labels``.
        modalities: Modality names; images are loaded and stacked in this order.
        img_size: Passed unchanged to ``cv2.resize`` as ``dsize``, which OpenCV
            interprets as ``(width, height)``.
        transforms: Stored on the instance. NOTE(review): it is not applied
            anywhere in this class; wire it in once its expected call
            signature (image only vs. image + boxes) is decided.

    Each item is ``(images, boxes)``:
        * ``images`` -- float tensor ``[len(modalities), C, H, W]`` with pixel
          values divided by 255. Channel order is whatever ``cv2.imread``
          returns (BGR for colour images); no colour conversion is done.
        * ``boxes`` -- float32 tensor ``[N, 5]``. The meaning of the five
          columns is not defined here (presumably class + box -- confirm).
    """

    def __init__(self, root, split='train', modalities=('rgb', 'ir'), img_size=(640, 640), transforms=None):
        self.root = root
        self.split = split
        # Copy into a fresh list so instances never share the default (or the
        # caller's) sequence object.
        self.modalities = list(modalities)
        self.img_size = img_size
        self.transforms = transforms

        rgb_dir = os.path.join(root, 'images', split, 'rgb')
        # os.listdir returns entries in arbitrary order; sort so that index ->
        # sample mapping is stable across runs and machines.
        self.ids = sorted(
            os.path.splitext(name)[0]
            for name in os.listdir(rgb_dir)
            if name.endswith('.jpg')
        )

    def __getitem__(self, idx):
        base_id = self.ids[idx]
        imgs = []

        for m in self.modalities:
            # Non-rgb files carry the modality as a file-name suffix.
            suffix = '' if m == 'rgb' else f'_{m}'
            path = os.path.join(self.root, 'images', self.split, m, base_id + suffix + '.jpg')
            if not os.path.isfile(path):
                raise FileNotFoundError(f"Missing '{m}' image for sample '{base_id}': {path}")

            img = cv2.imread(path)
            # cv2.imread signals failure by returning None instead of raising.
            if img is None:
                raise OSError(f"Could not decode image: {path}")

            img = cv2.resize(img, self.img_size)
            # HWC uint8 -> CHW float scaled by 1/255.
            imgs.append(torch.from_numpy(img).permute(2, 0, 1) / 255.0)

        labels_path = os.path.join(self.root, 'labels', self.split, base_id + '.txt')
        # reshape(-1, 5) normalises the single-row (1-D) and empty results of
        # np.loadtxt to a 2-D [N, 5] array.
        boxes = np.loadtxt(labels_path).reshape(-1, 5)

        return torch.stack(imgs, dim=0), torch.tensor(boxes, dtype=torch.float32)

    def __len__(self):
        return len(self.ids)


In [2]:
import os, cv2, torch
import numpy as np
from torch.utils.data import Dataset

class MultiModalDataset(Dataset):
    """Dataset of aligned multi-modal images with one label file per sample.

    NOTE(review): this notebook defines ``MultiModalDataset`` in two
    consecutive cells; this later definition is the one in effect once both
    cells have run. Keep a single copy.

    Directory layout, as implied by the paths built in this class::

        <root>/images/<split>/rgb/<id>.jpg
        <root>/images/<split>/<m>/<id>_<m>.jpg    # for every modality m != 'rgb'
        <root>/labels/<split>/<id>.txt            # whitespace-separated, 5 numbers per row

    Sample ids are discovered from the ``rgb`` directory only.

    Args:
        root: Dataset root directory.
        split: Sub-directory used under both ``images`` and ``labels``.
        modalities: Modality names; images are loaded and stacked in this order.
        img_size: Passed unchanged to ``cv2.resize`` as ``dsize``, which OpenCV
            interprets as ``(width, height)``.
        transforms: Stored on the instance. NOTE(review): it is not applied
            anywhere in this class; wire it in once its expected call
            signature (image only vs. image + boxes) is decided.

    Each item is ``(images, boxes)``:
        * ``images`` -- float tensor ``[len(modalities), C, H, W]`` with pixel
          values divided by 255. Channel order is whatever ``cv2.imread``
          returns (BGR for colour images); no colour conversion is done.
        * ``boxes`` -- float32 tensor ``[N, 5]``. The meaning of the five
          columns is not defined here (presumably class + box -- confirm).
    """

    def __init__(self, root, split='train', modalities=('rgb', 'ir'), img_size=(640, 640), transforms=None):
        self.root = root
        self.split = split
        # Copy into a fresh list so instances never share the default (or the
        # caller's) sequence object.
        self.modalities = list(modalities)
        self.img_size = img_size
        self.transforms = transforms

        rgb_dir = os.path.join(root, 'images', split, 'rgb')
        # Filter on the file extension. (This used to test against a pasted
        # absolute path to one image, which no directory entry can end with,
        # so the dataset was always empty.)
        # sorted(): os.listdir order is arbitrary; keep index -> sample stable.
        self.ids = sorted(
            os.path.splitext(name)[0]
            for name in os.listdir(rgb_dir)
            if name.endswith('.jpg')
        )

    def __getitem__(self, idx):
        base_id = self.ids[idx]
        imgs = []

        for m in self.modalities:
            # Non-rgb files carry the modality as a file-name suffix.
            suffix = '' if m == 'rgb' else f'_{m}'
            path = os.path.join(self.root, 'images', self.split, m, base_id + suffix + '.jpg')
            if not os.path.isfile(path):
                raise FileNotFoundError(f"Missing '{m}' image for sample '{base_id}': {path}")

            img = cv2.imread(path)
            # cv2.imread signals failure by returning None instead of raising.
            if img is None:
                raise OSError(f"Could not decode image: {path}")

            img = cv2.resize(img, self.img_size)
            # HWC uint8 -> CHW float scaled by 1/255.
            imgs.append(torch.from_numpy(img).permute(2, 0, 1) / 255.0)

        labels_path = os.path.join(self.root, 'labels', self.split, base_id + '.txt')
        # reshape(-1, 5) normalises the single-row (1-D) and empty results of
        # np.loadtxt to a 2-D [N, 5] array.
        boxes = np.loadtxt(labels_path).reshape(-1, 5)

        return torch.stack(imgs, dim=0), torch.tensor(boxes, dtype=torch.float32)

    def __len__(self):
        return len(self.ids)


In [3]:
import torch
import torch.nn as nn

class CrossModalFusion(nn.Module):
    """Fuse a sequence of equally-shaped feature tensors by averaging them.

    ``fc`` is a ``Linear(num_modalities, 1)`` registered on the module, so its
    weights appear in ``parameters()`` / ``state_dict()``, but ``forward``
    never calls it: the fusion is a plain, unweighted mean.
    """

    def __init__(self, num_modalities):
        super().__init__()
        self.fc = nn.Linear(in_features=num_modalities, out_features=1)

    def forward(self, features):
        """Return the element-wise mean over the tensors in ``features``.

        The inputs are joined along a new leading axis which is then averaged
        away, so the result has the shape of a single input tensor (the
        original author annotates each input as [batch, C, H, W]).
        """
        return torch.stack(features, dim=0).mean(dim=0)


In [4]:
import torch.nn as nn

class MetaTransformerBackbone(nn.Module):
    """Feature extractor made of two 3x3 convolution + ReLU stages.

    Channels go 3 -> 64 -> 128. With kernel size 3, padding 1 and the default
    stride, the spatial size of the input is preserved. Despite the class
    name, the module contains only convolutions and ReLUs.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the same order they run in.
        first_conv = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1)
        second_conv = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.encoder = nn.Sequential(first_conv, nn.ReLU(), second_conv, nn.ReLU())

    def forward(self, x):
        """Run ``x`` through the encoder and return the resulting feature map."""
        feature_map = self.encoder(x)
        return feature_map


In [5]:
import torch.nn as nn

class DetectionHead(nn.Module):
    """Convolutional prediction head applied to a 128-channel feature map.

    A 3x3 conv (128 -> 256, padding 1) and ReLU are followed by a 1x1 conv
    that emits ``num_classes + 5`` channels per spatial location; per the
    original author's note these are class scores plus bbox (x, y, w, h, obj).
    """

    def __init__(self, num_classes):
        super().__init__()
        out_channels = num_classes + 5  # class scores + bbox (x,y,w,h,obj)
        layers = [
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, out_channels, kernel_size=1),
        ]
        self.head = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw head output for feature map ``x``."""
        predictions = self.head(x)
        return predictions


In [6]:
import torch
from torch.utils.data import DataLoader
from src.dataloader import MultiModalDataset
from src.model.multimodal_detector import MultiModalMetaDetector

# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'src'". The `src.*` imports above only
# resolve when the directory that contains `src/` is on sys.path (e.g. the
# kernel was started from the project root) -- confirm the intended layout.

# Tunable settings for this run.
SEED = 0
NUM_CLASSES = 10
NUM_EPOCHS = 10
BATCH_SIZE = 4
LEARNING_RATE = 1e-4


def detection_collate(batch):
    """Collate ``(images, boxes)`` samples into ``(stacked_images, list_of_boxes)``.

    Images are stacked into one tensor. Box tensors are kept as a list because
    the number of boxes can differ between samples, and the default collate
    would try to ``torch.stack`` them and fail on the size mismatch.
    """
    imgs, boxes = zip(*batch)
    return torch.stack(imgs, dim=0), list(boxes)


# Seed before building the model / sampler so weight init and shuffling repeat.
torch.manual_seed(SEED)

dataset = MultiModalDataset(root='data', split='train')
if len(dataset) == 0:
    # Fail with a clear message rather than an error from inside the sampler.
    raise RuntimeError("MultiModalDataset(root='data', split='train') contains no samples")

dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=detection_collate)
model = MultiModalMetaDetector(num_classes=NUM_CLASSES)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

model.train()
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    for imgs, boxes in dataloader:
        # Scalar target = mean over every box value in the batch (0 when the
        # batch has no boxes, to avoid the NaN mean of an empty tensor).
        batch_boxes = torch.cat(boxes, dim=0)
        target = batch_boxes.mean() if batch_boxes.numel() else batch_boxes.new_zeros(())

        preds = model(imgs)
        # NOTE(review): placeholder objective -- it regresses every prediction
        # towards one scalar and is not a detection loss. Replace before
        # drawing conclusions from training.
        loss = ((preds - target) ** 2).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # One line per epoch (mean batch loss) instead of one line per batch.
    print(f"Epoch {epoch} | Loss: {running_loss / len(dataloader):.4f}")


ModuleNotFoundError: No module named 'src'