# **Import**

In [None]:
!pip3 install wandb matplotlib torch torchviz torchvision torchsummary torchviz weave nbformat netron onnx roboflow scikit-learn roboflow netron --quiet

import os
from os import path
import json
import random
import math
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
import torch.utils.data as data
from PIL import Image, ImageDraw
from torchvision import datasets, transforms, utils
import wandb
import netron
from roboflow import Roboflow

# **Util Functions**

In [None]:
def show_image(img):
    """Display a single image with matplotlib.

    The input is converted with torchvision's ``to_pil_image`` before being
    handed to ``plt.imshow``.
    """
    pil_image = transforms.functional.to_pil_image(img)
    plt.imshow(pil_image)
    plt.show()

def collate_fn(batch):
    """Collate ``(image, labels)`` samples into a batch.

    Images are stacked along a new leading dimension. The label entries are
    returned as a tuple, one per sample, without stacking, so each sample may
    carry a different number of label rows.
    """
    image_list, label_list = zip(*batch)
    return torch.stack(image_list, dim=0), label_list

def show_image_with_labels(image, labels, class_names=None):
    """Plot an image and overlay its bounding boxes.

    Args:
        image: Tensor that is permuted with ``(1, 2, 0)`` before plotting
            (i.e. channels-first on input).
        labels: Iterable of 5-element tensors
            ``(class_id, x_center, y_center, width, height)``; the box values
            are multiplied by the image width/height, so they are treated as
            fractions of the image size.
        class_names: Optional mapping/sequence indexed by integer class id.
            When falsy, the numeric class id is drawn instead.
    """
    pixels = image.permute(1, 2, 0).numpy()
    height, width, _ = pixels.shape

    fig, ax = plt.subplots(1, figsize=(8, 8))
    ax.imshow(pixels)

    for row in labels:
        class_id, x_center, y_center, bw, bh = row.tolist()
        # Convert the centre/size box to a top-left corner in pixels.
        left = (x_center - bw / 2) * width
        top = (y_center - bh / 2) * height
        outline = patches.Rectangle(
            (left, top), bw * width, bh * height,
            linewidth=2, edgecolor='red', facecolor='none',
        )
        ax.add_patch(outline)
        caption = class_names[int(class_id)] if class_names else str(int(class_id))
        ax.text(left, top - 5, caption, color='white', fontsize=12,
                bbox=dict(facecolor='red', alpha=0.5, pad=2))
    plt.axis('off')
    plt.show()

# Seed every RNG the notebook draws from. The DataLoader shuffle and
# AddGaussianNoise use torch's generator, so seeding only `random` and NumPy
# does not make runs repeatable.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Class id (as stored in the label files) -> human-readable ore name.
classes_types = {
    0: 'gold_ore',
    1: 'iron_ore',
    2: 'diamond_ore',
    3: 'redstone_ore',
    4: 'deepslate_iron_ore'
}
classes_number = len(classes_types)


# **Dataset**

In [None]:
# Download the dataset only if it is not already present on disk.
dataset_dir = "minecraft-ore-1"
if not os.path.isdir(dataset_dir):
    from roboflow import Roboflow
    # NOTE(review): an API key is committed in this notebook. Prefer setting
    # ROBOFLOW_API_KEY in the environment and rotate the committed key.
    rf = Roboflow(api_key=os.environ.get("ROBOFLOW_API_KEY", "bVvy50uMbWp85HBSLUkm"))
    project = rf.workspace("oblig10").project("minecraft-ore")
    version = project.version(1)
    dataset = version.download("yolov5")

# Remove a known problematic validation sample (label file and its image).
file_path_label = "minecraft-ore-1/valid/labels/2024-04-25_19-38-10_png_jpg.rf.627bb52ce40ad0431564b93df2aa900f.txt"
file_path_image = "minecraft-ore-1/valid/images/2024-04-25_19-38-10_png_jpg.rf.627bb52ce40ad0431564b93df2aa900f.jpg"
for problematic_path in (file_path_label, file_path_image):
    if os.path.exists(problematic_path):
        os.remove(problematic_path)
        print(f"Deleted: {problematic_path}")

In [None]:
import os
from PIL import Image
from os import path
import torch
from torch.utils.data import Dataset
from torchvision import transforms

class MinecraftV1(Dataset):
    """YOLO-format detection dataset read from ``<root>/<split>/{images,labels}``.

    Each image is paired with the label file that shares its file stem
    (``<stem>.txt``), so a missing or extra label file cannot shift the
    pairing of the remaining samples. An image whose label file is missing or
    empty yields an empty ``(0, 5)`` label tensor.

    Args:
        root: Dataset root directory containing the split folders.
        train: When true, read the ``train`` split.
        valid: When ``train`` is false, selects ``valid``; otherwise ``test``.
        transform: Optional callable applied to the PIL image.
    """

    def __init__(self, root, train=True, valid=False, transform=None):
        super().__init__()

        self.root = root
        self.train = train
        self.valid = valid
        self.transform = transform

        if train:
            split = 'train'
        elif valid:
            split = 'valid'
        else:
            split = 'test'
        self.data_path = path.join(root, split)

        self.images_path = path.join(self.data_path, 'images')
        self.labels_path = path.join(self.data_path, 'labels')
        self.data_images = []
        self.data_labels = []
        for image_file in sorted(os.listdir(self.images_path)):
            # splitext only strips the final extension, so stems that contain
            # dots (e.g. "x_png_jpg.rf.<hash>") are preserved.
            stem = path.splitext(image_file)[0]
            self.data_images.append(path.join(self.images_path, image_file))
            self.data_labels.append(path.join(self.labels_path, stem + '.txt'))

    def __len__(self):
        return len(self.data_images)

    @staticmethod
    def _load_labels(label_path):
        """Read one label file into a float32 tensor with one row per box.

        Blank lines are skipped. A missing or empty file returns a ``(0, 5)``
        tensor so downstream code can always index columns.
        """
        if not path.isfile(label_path):
            return torch.zeros((0, 5), dtype=torch.float32)
        with open(label_path, 'r') as file:
            rows = [list(map(float, line.split())) for line in file if line.strip()]
        if not rows:
            return torch.zeros((0, 5), dtype=torch.float32)
        return torch.tensor(rows, dtype=torch.float32)

    def __getitem__(self, idx):
        """Return ``(image, labels)`` for sample ``idx``."""
        image = Image.open(self.data_images[idx]).convert('RGB')
        if self.transform:
            image = self.transform(image)
        labels = self._load_labels(self.data_labels[idx])

        return image, labels

class AddGaussianNoise(object):
    """Transform that adds Gaussian noise to a tensor and clamps to [0, 1].

    Args:
        mean: Offset added to the sampled noise.
        std: Scale applied to the standard-normal sample.
    """

    def __init__(self, mean=0., std=0.05):
        self.mean, self.std = mean, std

    def __call__(self, tensor):
        # Sample noise with the same shape/dtype/device as the input.
        perturbation = torch.randn_like(tensor) * self.std + self.mean
        return torch.clamp(tensor + perturbation, min=0., max=1.)

    def __repr__(self):
        return f"{type(self).__name__}(mean={self.mean}, std={self.std})"

# Evaluation preprocessing: resize to 640x640 and convert to a tensor.
basic_transform = transforms.Compose(
    [transforms.Resize((640, 640)), transforms.ToTensor()]
)

# Training preprocessing: same resize, plus photometric jitter and additive
# noise. None of these steps move pixels, so the box labels stay valid.
augmented_transform = transforms.Compose(
    [
        transforms.Resize((640, 640)),
        transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.2, hue=0.05),
        transforms.ToTensor(),
        AddGaussianNoise(0., 0.03),
    ]
)

# One dataset object per split, all rooted at ./minecraft-ore-1.
mc_train = MinecraftV1(
    root=path.join(os.getcwd(), "minecraft-ore-1"),
    transform=augmented_transform,
)
mc_test = MinecraftV1(
    root=path.join(os.getcwd(), "minecraft-ore-1"),
    train=False,
    transform=basic_transform,
)
mc_valid = MinecraftV1(
    root=path.join(os.getcwd(), "minecraft-ore-1"),
    train=False,
    valid=True,
    transform=basic_transform,
)


# **DataLoader**

In [None]:
# Batches are built with collate_fn so each image keeps its own label tensor.
trainloader = data.DataLoader(mc_train, batch_size=4, shuffle=True, collate_fn=collate_fn, num_workers=0)
validloader = data.DataLoader(mc_valid, batch_size=4, shuffle=False, collate_fn=collate_fn, num_workers=0)
testloader = data.DataLoader(mc_test, batch_size=4, shuffle=False, collate_fn=collate_fn, num_workers=0)

# Preview one training batch.
images, labels = next(iter(trainloader))

# Iterate over what the batch actually contains; a hard-coded range(4) raises
# IndexError when the dataset holds fewer than 4 images.
for image, image_labels in zip(images, labels):
    show_image_with_labels(image, image_labels, class_names=classes_types)


# **Device**

In [None]:
def get_device():
    """Return the CUDA device when available, otherwise the CPU device."""
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")

device = get_device()
print(f"Using device: {device}")


# **Yolo V5**

## Architecture

In [None]:
import torch
import torch.nn as nn

# Conv-BN-SiLU block
class CBS(nn.Module):
    """Bias-free 2D convolution followed by batch norm and SiLU.

    When ``padding`` is None it defaults to ``(kernel_size - 1) // 2``.
    """

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=None):
        super().__init__()
        effective_padding = (kernel_size - 1) // 2 if padding is None else padding
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=effective_padding,
            bias=False,
        )
        self.batch_norm = nn.BatchNorm2d(out_channels)
        self.activation = nn.SiLU()

    def forward(self, x):
        x = self.conv(x)
        x = self.batch_norm(x)
        return self.activation(x)

# Bottleneck for C3
class Bottleneck(nn.Module):
    """1x1 CBS followed by 3x3 CBS, with an optional residual connection.

    The residual add is only enabled when ``shortcut`` is true and the input
    and output channel counts match.
    """

    def __init__(self, in_channels, out_channels, shortcut=True, expansion=0.5):
        super().__init__()
        mid_channels = int(out_channels * expansion)
        self.conv1 = CBS(in_channels, mid_channels, 1)
        self.conv2 = CBS(mid_channels, out_channels, 3)
        self.use_shortcut = shortcut and in_channels == out_channels

    def forward(self, x):
        transformed = self.conv2(self.conv1(x))
        if self.use_shortcut:
            return x + transformed
        return transformed

# CSP C3 block
class C3(nn.Module):
    """Two-branch block: a bottleneck branch and a 1x1 bypass branch.

    Both branches project the input to ``int(out_channels * expansion)``
    channels; their outputs are concatenated on the channel axis and fused by
    a final 1x1 CBS.
    """

    def __init__(self, in_channels, out_channels, num_blocks=1, shortcut=True, expansion=0.5):
        super().__init__()
        branch_channels = int(out_channels * expansion)
        self.conv1 = CBS(in_channels, branch_channels, 1)
        self.conv2 = CBS(in_channels, branch_channels, 1)
        stacked = [
            Bottleneck(branch_channels, branch_channels, shortcut, expansion)
            for _ in range(num_blocks)
        ]
        self.bottlenecks = nn.Sequential(*stacked)
        self.conv3 = CBS(2 * branch_channels, out_channels, 1)

    def forward(self, x):
        deep_branch = self.bottlenecks(self.conv1(x))
        bypass_branch = self.conv2(x)
        return self.conv3(torch.cat((deep_branch, bypass_branch), dim=1))

# SPPF block
class SPPF(nn.Module):
    """Three chained max-pools whose outputs are concatenated with the input.

    The input is first halved in channels by a 1x1 CBS; the reduced tensor and
    the three successively pooled tensors are concatenated and fused by a
    second 1x1 CBS.
    """

    def __init__(self, in_channels, out_channels, kernel_size=5):
        super().__init__()
        reduced_channels = in_channels // 2
        self.conv1 = CBS(in_channels, reduced_channels, 1)
        # stride 1 with kernel_size // 2 padding keeps the spatial size.
        self.maxpool = nn.MaxPool2d(kernel_size=kernel_size, stride=1, padding=kernel_size // 2)
        self.conv2 = CBS(reduced_channels * 4, out_channels, 1)

    def forward(self, x):
        pyramid = [self.conv1(x)]
        for _ in range(3):
            pyramid.append(self.maxpool(pyramid[-1]))
        return self.conv2(torch.cat(pyramid, dim=1))

# CSPDarknet Backbone
class CSPDarknet(nn.Module):
    """Backbone: a stride-2 stem followed by four stride-2 stages.

    ``forward`` returns the outputs of stages 3, 4 and 5 (256, 512 and 1024
    channels), i.e. the feature maps at 1/8, 1/16 and 1/32 of the input size.
    """

    def __init__(self):
        super().__init__()
        self.stem = CBS(3, 64, kernel_size=6, stride=2, padding=2)
        self.stage2 = nn.Sequential(CBS(64, 128, 3, 2), C3(128, 128, num_blocks=2))
        self.stage3 = nn.Sequential(CBS(128, 256, 3, 2), C3(256, 256, num_blocks=4))
        self.stage4 = nn.Sequential(CBS(256, 512, 3, 2), C3(512, 512, num_blocks=6))
        self.stage5 = nn.Sequential(
            CBS(512, 1024, 3, 2),
            C3(1024, 1024, num_blocks=2),
            SPPF(1024, 1024, kernel_size=5),
        )

    def forward(self, x):
        p2 = self.stage2(self.stem(x))
        p3 = self.stage3(p2)
        p4 = self.stage4(p3)
        p5 = self.stage5(p4)
        return p3, p4, p5  # P3/8, P4/16, P5/32

# PANet Neck
class PANet(nn.Module):
    """Neck that fuses the three backbone maps top-down, then bottom-up.

    ``forward(x3, x4, x5)`` expects 256/512/1024-channel inputs and returns
    three maps with 256/512/1024 channels at the same three resolutions.
    """

    def __init__(self):
        super().__init__()
        # top-down pathway
        self.reduce_conv_p5     = CBS(1024, 512, 1)
        self.reduce_conv_p4     = CBS(512, 512, 1)
        self.c3_topdown_p4      = C3(1024, 512, shortcut=False, num_blocks=2)
        self.reduce_conv_p4_td  = CBS(512, 256, 1)
        self.reduce_conv_p3     = CBS(256, 256, 1)
        self.c3_topdown_p3      = C3(512, 256, shortcut=False, num_blocks=2)
        # bottom-up pathway
        self.downsample_conv_p3 = CBS(256, 256, 3, 2)
        self.c3_bottomup_p4     = C3(256+512, 512,  shortcut=False, num_blocks=1)
        self.downsample_conv_p4 = CBS(512, 512, 3, 2)
        self.c3_bottomup_p5     = C3(512+512, 1024,  shortcut=False, num_blocks=1)
        self.upsample           = nn.Upsample(scale_factor=2, mode='nearest')

    def forward(self, x3, x4, x5):
        # --- top-down: coarse features are upsampled and merged into finer ones
        lateral5 = self.reduce_conv_p5(x5)
        lateral5_up = self.upsample(lateral5)
        lateral4 = self.reduce_conv_p4(x4)
        fused4 = self.c3_topdown_p4(torch.cat([lateral4, lateral5_up], 1))

        fused4_reduced = self.reduce_conv_p4_td(fused4)
        fused4_up = self.upsample(fused4_reduced)
        lateral3 = self.reduce_conv_p3(x3)
        out3 = self.c3_topdown_p3(torch.cat([lateral3, fused4_up], 1))

        # --- bottom-up: fine features are downsampled and merged back
        out3_down = self.downsample_conv_p3(out3)
        out4 = self.c3_bottomup_p4(torch.cat([out3_down, fused4], 1))

        out4_down = self.downsample_conv_p4(out4)
        out5 = self.c3_bottomup_p5(torch.cat([out4_down, lateral5], 1))
        return out3, out4, out5

# Detect Head
class Detect(nn.Module):
    """Per-scale 1x1 convolutions producing raw (un-activated) predictions.

    Args:
        num_classes: Number of classes; each anchor predicts
            ``num_classes + 5`` values.
        anchors: One flat ``[w0, h0, w1, h1, ...]`` sequence per scale.
        channels: Input channel count of each scale's feature map.

    ``forward`` returns one tensor per scale shaped
    ``(batch, num_anchors, height, width, num_classes + 5)``.
    """

    def __init__(self, num_classes=5, anchors=(), channels=()):
        super().__init__()
        self.num_classes = num_classes
        self.num_outputs = num_classes + 5
        self.num_layers = len(anchors)
        self.num_anchors = len(anchors[0]) // 2
        anchor_table = torch.tensor(anchors).float().view(self.num_layers, -1, 2)
        # Stored as a buffer so it follows .to(device) and is saved in state_dict.
        self.register_buffer('anchors', anchor_table)
        per_cell_outputs = self.num_outputs * self.num_anchors
        self.detect_convs = nn.ModuleList(
            [nn.Conv2d(in_ch, per_cell_outputs, 1) for in_ch in channels]
        )

    def forward(self, features):
        outputs = []
        for layer_idx in range(self.num_layers):
            raw = self.detect_convs[layer_idx](features[layer_idx])
            batch_size, _, height, width = raw.shape
            # (B, A*O, H, W) -> (B, A, O, H, W) -> (B, A, H, W, O)
            raw = raw.view(batch_size, self.num_anchors, self.num_outputs, height, width)
            outputs.append(raw.permute(0, 1, 3, 4, 2).contiguous())
        return outputs

# YOLOv5m 6.0
class YOLOv5m(nn.Module):
    """Full detector: CSPDarknet backbone -> PANet neck -> Detect head.

    Args:
        num_classes: Number of object classes passed to the head.
        anchors: Optional per-scale flat anchor lists; when None, three scales
            with three anchors each are used (values listed below).
    """

    def __init__(self, num_classes=5, anchors=None):
        super().__init__()
        if anchors is None:
            anchors = [
                [10, 13, 16, 30, 33, 23],
                [30, 61, 62, 45, 59, 119],
                [116, 90, 156, 198, 373, 326]
            ]
        self.backbone = CSPDarknet()
        self.neck = PANet()
        self.detect = Detect(num_classes, anchors, channels=[256, 512, 1024])

    def forward(self, x):
        backbone_maps = self.backbone(x)
        neck_maps = self.neck(*backbone_maps)
        return self.detect(list(neck_maps))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = YOLOv5m(num_classes=classes_number).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Smoke test: push one random 640x640 image through the network to check that
# all layer shapes line up. No gradients are needed for this, so skip building
# the autograd graph.
x = torch.randn(1, 3, 640, 640).to(device)
with torch.no_grad():
    y = model(x)



In [None]:
import torch
import torch.nn.functional as F
import math

def bbox_iou(box1, box2, CIoU=False, eps=1e-7):
    """Element-wise IoU (or Complete-IoU) of two sets of boxes.

    Args:
        box1, box2: Tensors whose last dimension holds
            ``(x_center, y_center, width, height)``.
        CIoU: When true, subtract the centre-distance and aspect-ratio
            penalties from the IoU.
        eps: Small constant guarding the divisions.

    Returns:
        Tensor of IoU (or CIoU) values, one per box pair.
    """
    cx1, cy1, w1, h1 = box1[..., 0], box1[..., 1], box1[..., 2], box1[..., 3]
    cx2, cy2, w2, h2 = box2[..., 0], box2[..., 1], box2[..., 2], box2[..., 3]

    # Corner coordinates of both boxes.
    b1_x1, b1_y1 = cx1 - w1 / 2, cy1 - h1 / 2
    b1_x2, b1_y2 = cx1 + w1 / 2, cy1 + h1 / 2
    b2_x1, b2_y1 = cx2 - w2 / 2, cy2 - h2 / 2
    b2_x2, b2_y2 = cx2 + w2 / 2, cy2 + h2 / 2

    overlap_w = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0)
    overlap_h = (torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1)).clamp(0)
    inter = overlap_w * overlap_h
    area1 = (b1_x2 - b1_x1).clamp(0) * (b1_y2 - b1_y1).clamp(0)
    area2 = (b2_x2 - b2_x1).clamp(0) * (b2_y2 - b2_y1).clamp(0)
    union = area1 + area2 - inter + eps
    iou = inter / union

    if not CIoU:
        return iou

    # Squared diagonal of the smallest box enclosing both inputs.
    enclose_w = torch.max(b1_x2, b2_x2) - torch.min(b1_x1, b2_x1)
    enclose_h = torch.max(b1_y2, b2_y2) - torch.min(b1_y1, b2_y1)
    c2 = enclose_w ** 2 + enclose_h ** 2
    # Squared distance between the two box centres.
    rho2 = (cx1 - cx2) ** 2 + (cy1 - cy2) ** 2
    c2 = c2 + eps

    # Aspect-ratio consistency term.
    v = (4 / (math.pi ** 2)) * torch.pow(torch.atan(w1 / (h1 + eps)) - torch.atan(w2 / (h2 + eps)), 2)
    with torch.no_grad():
        # alpha is treated as a constant weight: no gradient flows through it.
        alpha = v / (1 - iou + v + eps)
    return iou - (rho2 / c2) - alpha * v

def build_targets(p, targets, anchors, num_classes=None, device=None, img_size=640):
    """Build per-scale training targets for the YOLOv5 loss.

    Both call forms used in this file are accepted:
    ``build_targets(p, targets, anchors, device)`` and
    ``build_targets(p, targets, anchors, num_classes, device)``.

    Anchors are given in input-image pixels. They are divided by each scale's
    stride (``img_size / grid size``) so that anchor matching and the returned
    ``anch`` tensors use the same grid-cell units as the target boxes. Decoding
    with pixel anchors at inference then yields pixel-sized boxes.

    Args:
        p (list): Prediction tensors, one per scale, shaped
            ``(batch, anchors, grid_h, grid_w, outputs)``.
        targets (list): One label tensor per image with rows
            ``(class, x, y, w, h)`` normalised to [0, 1]. Empty tensors are
            allowed.
        anchors (list): Per-scale anchors in pixels, either flat
            ``[w0, h0, ...]`` or shaped ``(num_anchors, 2)``.
        num_classes (int, optional): Unused; accepted for call compatibility.
        device (torch.device, optional): Device for the created tensors;
            defaults to the device of ``p[0]``.
        img_size (int or (height, width)): Network input size in pixels.

    Returns:
        tcls (list): Class index tensor per scale.
        tbox (list): ``(dx, dy, w, h)`` box targets per scale, in grid units
            with the centre expressed relative to the assigned cell.
        indices (list): ``(image, anchor, grid_y, grid_x)`` index tuple per scale.
        anch (list): Matched anchors per scale, in grid units.
    """
    # Legacy four-positional form: the fourth argument is the device.
    if device is None and isinstance(num_classes, (torch.device, str)):
        num_classes, device = None, num_classes
    if device is None:
        device = p[0].device

    nl = len(anchors)
    anchors_px = [
        torch.as_tensor(a, dtype=torch.float32, device=device).reshape(-1, 2)
        for a in anchors
    ]
    na = anchors_px[0].shape[0]  # anchors per scale
    if isinstance(img_size, (int, float)):
        img_h = img_w = float(img_size)
    else:
        img_h, img_w = (float(s) for s in img_size)

    tcls, tbox, indices, anch = [], [], [], []
    gain = torch.ones(7, device=device)  # normalized to gridspace gain

    # Convert targets to rows of (image, class, x, y, w, h).
    per_image = []
    for image_idx, image_labels in enumerate(targets):
        image_labels = torch.as_tensor(image_labels, dtype=torch.float32, device=device)
        if image_labels.numel() == 0:
            # An image without boxes may arrive as shape (0,); give it columns.
            image_labels = image_labels.reshape(0, 5)
        image_col = torch.full((image_labels.shape[0], 1), float(image_idx), device=device)
        per_image.append(torch.cat((image_col, image_labels), 1))
    targets = torch.cat(per_image, 0) if per_image else torch.zeros((0, 6), device=device)

    nt = targets.shape[0]
    if nt == 0:
        for _ in range(nl):
            empty_idx = torch.zeros(0, dtype=torch.long, device=device)
            tcls.append(torch.zeros(0, device=device, dtype=torch.long))
            tbox.append(torch.zeros(0, 4, device=device))
            indices.append((empty_idx,) * 4)
            anch.append(torch.zeros(0, 2, device=device))
        return tcls, tbox, indices, anch

    # Replicate every target once per anchor and append the anchor index.
    ai = torch.arange(na, device=device).float().view(na, 1).repeat(1, nt)
    targets = torch.cat((targets.repeat(na, 1, 1), ai[..., None]), 2).view(-1, 7)  # (na*nt, 7)

    g = 0.5  # bias
    anchor_t = 4.0  # maximum allowed width/height ratio between box and anchor
    off = torch.tensor([[0, 0], [1, 0], [0, 1], [-1, 0], [0, -1]], device=device).float() * g

    for i in range(nl):
        grid_h, grid_w = p[i].shape[2], p[i].shape[3]
        stride = torch.tensor([img_w / grid_w, img_h / grid_h], device=device)
        anchors_i = anchors_px[i] / stride  # pixels -> grid cells
        gain[2:6] = torch.tensor([grid_w, grid_h, grid_w, grid_h], dtype=gain.dtype, device=device)
        t = targets * gain  # scale normalised xywh to grid units

        # Keep only target/anchor pairs with a compatible shape.
        r = t[:, 4:6] / anchors_i[t[:, 6].long()]
        j = torch.max(r, 1. / r).max(1)[0] < anchor_t
        t = t[j]

        # Also assign each target to the neighbouring cells nearest its centre.
        gxy = t[:, 2:4]
        gxi = gain[[2, 3]] - gxy
        j, k = ((gxy % 1 < g) & (gxy > 1)).T
        l, m = ((gxi % 1 < g) & (gxi > 1)).T
        j = torch.stack((torch.ones_like(j), j, k, l, m))
        t = t.repeat((5, 1, 1))[j]
        offsets = (torch.zeros_like(gxy)[None] + off[:, None])[j]

        # Define
        bc = t[:, :2].long()
        gxy = t[:, 2:4]
        gwh = t[:, 4:6]
        a = t[:, 6].long()
        gij = (gxy - offsets).long()
        gi, gj = gij.T

        b, c = bc.T
        # clamp_ works in place on views of gij, so tbox below uses the clamped cells.
        indices.append((b, a, gj.clamp_(0, grid_h - 1), gi.clamp_(0, grid_w - 1)))
        tbox.append(torch.cat((gxy - gij, gwh), 1))
        anch.append(anchors_i[a])
        tcls.append(c)
    return tcls, tbox, indices, anch

def compute_loss(p, targets, anchors, num_classes, device):
    """Compute the detection loss: box (CIoU) + objectness (BCE) + class (BCE).

    Args:
        p: List of raw prediction tensors, one per scale, shaped
            ``(batch, anchors, grid_h, grid_w, 5 + num_classes)``.
        targets: List of per-image label tensors.
        anchors: List of anchor arrays, one per scale.
        num_classes: Number of classes; the class term is skipped when it is 1.
        device: torch.device on which the loss tensors are created.

    Returns:
        ``(total_loss, lobj, lbox, lcls)``, each a 1-element tensor.
    """
    BCEcls = torch.nn.BCEWithLogitsLoss()
    BCEobj = torch.nn.BCEWithLogitsLoss()
    lcls = torch.zeros(1, device=device)
    lbox = torch.zeros(1, device=device)
    lobj = torch.zeros(1, device=device)
    # `device` is passed by keyword. Passing five positional arguments raised
    # TypeError against build_targets' (p, targets, anchors, device) signature.
    tcls, tbox, indices, anch = build_targets(p, targets, anchors, device=device)
    for i, pi in enumerate(p):
        b, a, gj, gi = indices[i]
        # Objectness target: zero everywhere except at matched cells.
        tobj = torch.zeros(pi.shape[:4], dtype=pi.dtype, device=device)
        n = b.shape[0]
        if n:
            # Predictions at the matched (image, anchor, row, column) cells.
            ps = pi[b, a, gj, gi]
            # Decode the box relative to its cell and anchor.
            pxy = ps[:, :2].sigmoid() * 2 - 0.5
            pwh = (ps[:, 2:4].sigmoid() * 2) ** 2 * anch[i]
            pbox = torch.cat((pxy, pwh), 1)
            iou = bbox_iou(pbox, tbox[i], CIoU=True)
            lbox += (1.0 - iou).mean()
            # Objectness is trained towards the (detached, non-negative) IoU.
            iou = iou.detach().clamp(0)
            tobj[b, a, gj, gi] = iou
            # Classification: one-hot targets over the class logits.
            if num_classes > 1:
                t = torch.full_like(ps[:, 5:], 0.0, device=device)
                t[torch.arange(n, device=device), tcls[i]] = 1.0
                lcls += BCEcls(ps[:, 5:], t)
        lobj += BCEobj(pi[..., 4], tobj)
    total_loss = lbox + lobj + lcls
    return total_loss, lobj, lbox, lcls


# Training

In [None]:
# Fresh model and optimizer for the training run.
model = YOLOv5m(num_classes=classes_number).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)
# NOTE(review): this StepLR instance is never stepped; the training cell
# rebinds `scheduler` to CosineAnnealingLR before the epoch loop starts.
scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

In [None]:
num_epochs = 100
print(f"Using device: {device}")

# Per-scale anchors, reshaped to (num_anchors, 2) arrays of (width, height).
anchors = [
    np.array(layer_anchors).reshape(-1, 2)
    for layer_anchors in (
        [10, 13, 16, 30, 33, 23],
        [30, 61, 62, 45, 59, 119],
        [116, 90, 156, 198, 373, 326],
    )
]

# Per-epoch mean losses, kept for later inspection.
train_array = []
valid_array = []

run = wandb.init(
    entity="s-gardier-work",
    project="yolov5",
    config={
        "architecture": "YOLOv5",
        "dataset": "https://universe.roboflow.com/oblig10/minecraft-ore/dataset/1",
        "epochs": num_epochs,
    },
)

min_validation_loss = np.inf
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    running_loss = 0.0
    for images, labels in trainloader:
        images = images.to(device)
        labels = [label.to(device) for label in labels]
        optimizer.zero_grad()
        outputs = model(images)
        total_loss, lobj, lbox, lcls = compute_loss(
            outputs, labels, anchors, classes_number, device
        )
        total_loss.backward()
        optimizer.step()
        running_loss += total_loss.item()
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(trainloader):.4f}")
    epoch_loss = running_loss / len(trainloader)
    train_array.append(epoch_loss)

    # ---- validation pass (no gradients) ----
    model.eval()
    val_running_loss = 0.0
    with torch.no_grad():
        for images, labels in validloader:
            images = images.to(device)
            labels = [label.to(device) for label in labels]
            outputs = model(images)
            total_loss, lobj, lbox, lcls = compute_loss(
                outputs, labels, anchors, classes_number, device
            )
            val_running_loss += total_loss.item()
    print(f"Validation Loss: {val_running_loss / len(validloader):.4f}")
    valid_epoch_loss = val_running_loss / len(validloader)
    valid_array.append(valid_epoch_loss)

    # Checkpoint only when the summed validation loss improves.
    if val_running_loss < min_validation_loss:
        min_validation_loss = val_running_loss
        print(f"Validation loss improved to {min_validation_loss/len(validloader):.4f}, saving model...")
        torch.save(model.state_dict(), f"yolov5_minecraft_ore_{epoch + 1}.pth")

    epoch_metrics = {
        "epoch": (epoch + 1) / num_epochs,
        "training_loss": running_loss / len(trainloader),
        "validation_loss": val_running_loss / len(validloader)
    }
    run.log(epoch_metrics)
    scheduler.step()


# Inference

In [None]:
def load_model(model, model_path):
    """Load weights from ``model_path`` into ``model`` and return it in eval mode.

    The state dict is mapped onto the module-level ``device``; the same model
    object that was passed in is updated in place and returned.
    """
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    return model

def _nms(boxes, iou_thresh):
    """Greedy class-agnostic NMS; returns the indices of the rows to keep.

    ``boxes`` is an ``(N, >=5)`` array whose first five columns are
    ``x1, y1, x2, y2, score``.
    """
    x1, y1, x2, y2, scores = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3], boxes[:, 4]
    areas = (x2 - x1) * (y2 - y1)
    order = scores.argsort()[::-1]
    keep = []
    while order.size > 0:
        best, rest = order[0], order[1:]
        keep.append(best)
        inter_w = np.maximum(0.0, np.minimum(x2[best], x2[rest]) - np.maximum(x1[best], x1[rest]))
        inter_h = np.maximum(0.0, np.minimum(y2[best], y2[rest]) - np.maximum(y1[best], y1[rest]))
        intersection = inter_w * inter_h
        union = areas[best] + areas[rest] - intersection
        # Guard against zero-area boxes producing 0/0.
        iou = intersection / np.maximum(union, 1e-9)
        order = rest[iou <= iou_thresh]
    return keep


def decode_predictions(outputs, confidence_thresh=0.1, iou_thresh=0.5, anchors=None, strides=(8, 16, 32)):
    """Convert raw model outputs into pixel-space boxes and apply NMS.

    The decoding mirrors the box parameterisation used by ``compute_loss``:
    ``xy = (sigmoid * 2 - 0.5 + cell) * stride`` and
    ``wh = (sigmoid * 2) ** 2 * anchor``, with the sigmoid applied exactly once.

    Args:
        outputs: List of raw prediction tensors, one per scale, shaped
            ``(batch, anchors, grid_h, grid_w, 5 + num_classes)``.
        confidence_thresh: Minimum ``objectness * best class probability``.
        iou_thresh: IoU above which a lower-scored box is suppressed.
        anchors: Per-scale anchors in pixels. Defaults to the global
            ``model.detect.anchors``.
        strides: Pixel stride of each scale (the defaults assume a 640x640
            input and 80/40/20 grids).

    Returns:
        List of ``[x1, y1, x2, y2, confidence, class_id]`` lists. Detections
        from every image in the batch are pooled before NMS, so pass a batch
        of one to get per-image results.
    """
    if anchors is None:
        anchors = model.detect.anchors
    if torch.is_tensor(anchors):
        anchors = anchors.detach().cpu().numpy()

    detections = []
    for scale_idx, output in enumerate(outputs):
        # One sigmoid over the whole tensor; nothing below re-applies it.
        pred = output.detach().sigmoid().cpu().numpy()
        _, _, grid_h, grid_w, _ = pred.shape
        stride = strides[scale_idx]
        anchor = np.asarray(anchors[scale_idx], dtype=np.float32).reshape(-1, 2)

        grid_y, grid_x = np.mgrid[:grid_h, :grid_w]

        # Box centre and size in input-image pixels.
        x_center = (pred[..., 0] * 2 - 0.5 + grid_x) * stride
        y_center = (pred[..., 1] * 2 - 0.5 + grid_y) * stride
        box_w = (pred[..., 2] * 2) ** 2 * anchor[:, 0].reshape(1, -1, 1, 1)
        box_h = (pred[..., 3] * 2) ** 2 * anchor[:, 1].reshape(1, -1, 1, 1)

        # Confidence = objectness * probability of the most likely class.
        cls_probs = pred[..., 5:]
        class_ids = cls_probs.argmax(axis=-1)
        confidence = pred[..., 4] * cls_probs.max(axis=-1)

        mask = confidence > confidence_thresh
        if mask.any():
            candidates = np.stack([
                x_center - box_w / 2,
                y_center - box_h / 2,
                x_center + box_w / 2,
                y_center + box_h / 2,
                confidence,
                class_ids,
            ], axis=-1)
            detections.append(candidates[mask])

    if not detections:
        return []

    boxes = np.concatenate(detections, axis=0)
    keep = _nms(boxes, iou_thresh)
    return boxes[keep].tolist()

def test_single_image(model, dataset, index=0):
    """Run the model on one dataset sample and plot truth vs. prediction.

    Ground-truth boxes are drawn in lime and predicted boxes in red. The
    decoded predictions are also printed.

    Args:
        model: Detector to run; it is switched to eval mode.
        dataset: Indexable dataset returning ``(image, labels)`` with labels
            as rows of ``(class, x_center, y_center, width, height)`` scaled
            to [0, 1].
        index: Sample index to visualise.
    """
    # Get image and labels
    image, true_labels = dataset[index]
    true_labels = true_labels.cpu().numpy()

    # Run inference
    model.eval()
    with torch.no_grad():
        outputs = model(image.unsqueeze(0).to(device))

    # Decode predictions
    pred_boxes = decode_predictions(outputs)
    print(pred_boxes)

    # Convert the normalised labels to pixel corner boxes, using the actual
    # size of the image tensor rather than a hard-coded 640x640.
    img_h, img_w = image.shape[-2:]
    true_boxes = []
    for label in true_labels:
        class_id, xc, yc, bw, bh = label
        x = (xc - bw/2) * img_w
        y = (yc - bh/2) * img_h
        w = bw * img_w
        h = bh * img_h
        true_boxes.append([x, y, x+w, y+h, 1.0, class_id])

    # Visualize
    image_np = image.permute(1, 2, 0).cpu().numpy()
    fig, ax = plt.subplots(1, figsize=(10, 10))
    ax.imshow(image_np)

    # Draw true boxes (green)
    for box in true_boxes:
        x1, y1, x2, y2, _, class_id = box
        rect = patches.Rectangle(
            (x1, y1), x2-x1, y2-y1,
            linewidth=2, edgecolor='lime', facecolor='none'
        )
        ax.add_patch(rect)
        ax.text(x1, y1-5, classes_types[int(class_id)],
                color='white', fontsize=10,
                bbox=dict(facecolor='lime', alpha=0.8, pad=1))

    # Draw predicted boxes (red)
    for box in pred_boxes:
        x1, y1, x2, y2, conf, class_id = box
        rect = patches.Rectangle(
            (x1, y1), x2-x1, y2-y1,
            linewidth=2, edgecolor='red', facecolor='none'
        )
        ax.add_patch(rect)
        ax.text(x1, y1-5, f"{classes_types[int(class_id)]} {conf:.2f}",
                color='white', fontsize=10,
                bbox=dict(facecolor='red', alpha=1, pad=1))

    plt.axis('off')
    plt.show()

In [None]:
import glob
import re

model_path = "yolov5_minecraft_ore_100.pth"
if not os.path.exists(model_path):
    print(f"Error: Model file '{model_path}' does not exist.")
    # Training only writes a checkpoint on epochs where validation loss
    # improved, so the epoch-100 file is often absent. Fall back to the
    # checkpoint from the highest saved epoch instead of failing in torch.load.
    def _checkpoint_epoch(checkpoint_path):
        match = re.search(r"_(\d+)\.pth$", checkpoint_path)
        return int(match.group(1)) if match else -1

    candidates = sorted(glob.glob("yolov5_minecraft_ore_*.pth"), key=_checkpoint_epoch)
    if not candidates:
        raise FileNotFoundError(
            f"No checkpoint matching 'yolov5_minecraft_ore_*.pth' found; train the model first."
        )
    model_path = candidates[-1]
    print(f"Falling back to latest checkpoint: {model_path}")
model_from_disk = load_model(model, model_path)

# Visualise predictions on one randomly chosen test image.
random_index = random.randint(0, len(mc_test) - 1)
test_single_image(model_from_disk, mc_test, index=random_index)


# Evaluation

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_model(model, dataloader, class_names, iou_thresh=0.5, conf_thresh=0.1):
    """Print image-level precision, recall and F1 for every class.

    For each image, a class counts as present in the ground truth if any
    label row carries it, and as predicted if any decoded box carries it.
    Boxes are not matched by location; ``iou_thresh`` is only forwarded to the
    NMS step inside ``decode_predictions``.

    Args:
        model: Detector to evaluate; switched to eval mode.
        dataloader: Yields ``(images, labels)`` batches.
        class_names: Mapping of class id to display name.
        iou_thresh: NMS IoU threshold.
        conf_thresh: Confidence threshold for decoded boxes.
    """
    model.eval()
    truth_per_image = []
    preds_per_image = []

    for images, labels in dataloader:
        images = images.to(device)
        with torch.no_grad():
            outputs = model(images)
        for sample_idx in range(images.size(0)):
            # Ground-truth class ids of this image.
            gt_rows = labels[sample_idx].cpu().numpy()
            if len(gt_rows) > 0:
                gt_classes = gt_rows[:, 0].astype(int)
            else:
                gt_classes = np.array([], dtype=int)
            truth_per_image.append(gt_classes)

            # Decode this image alone by slicing it out of every scale.
            sample_outputs = [scale_out[sample_idx:sample_idx + 1] for scale_out in outputs]
            detected = decode_predictions(
                sample_outputs, confidence_thresh=conf_thresh, iou_thresh=iou_thresh
            )
            if len(detected) > 0:
                detected_classes = np.array([int(det[5]) for det in detected])
            else:
                detected_classes = np.array([], dtype=int)
            preds_per_image.append(detected_classes)

    # One binary "class present in image" problem per class.
    metrics = {}
    for class_id, class_name in class_names.items():
        y_true = [int(class_id in gt) for gt in truth_per_image]
        y_pred = [int(class_id in pred) for pred in preds_per_image]
        metrics[class_name] = {
            'precision': precision_score(y_true, y_pred, zero_division=0),
            'recall': recall_score(y_true, y_pred, zero_division=0),
            'f1': f1_score(y_true, y_pred, zero_division=0),
        }

    for class_name, vals in metrics.items():
        print(f"{class_name}: Precision={vals['precision']:.3f}, Recall={vals['recall']:.3f}, F1={vals['f1']:.3f}")

# Evaluate on the test set. `model` is the same object that load_model()
# updated in place in the inference cell, so it carries the loaded weights.
evaluate_model(model, testloader, classes_types)
