In [18]:
import sys
sys.path.append("..")

import torch
import torchvision
from PIL import Image
from lib.coco_eval import CocoEvaluator
import random
from PIL import Image
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.transforms.functional import to_pil_image
import torchvision.transforms as T
import numpy as np
import json
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import CocoDetection
from pycocotools.coco import COCO

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

#Load the category names
NYU40CLASSES = ['void',
                'wall', 'floor', 'cabinet', 'bed', 'chair',
                'sofa', 'table', 'door', 'window', 'bookshelf',
                'picture', 'counter', 'blinds', 'desk', 'shelves',
                'curtain', 'dresser', 'pillow', 'mirror', 'floor_mat',
                'clothes', 'ceiling', 'books', 'refridgerator', 'television',
                'paper', 'towel', 'shower_curtain', 'box', 'whiteboard',
                'person', 'night_stand', 'toilet', 'sink', 'lamp',
                'bathtub', 'bag', 'otherstructure', 'otherfurniture', 'otherprop']

# Create a dictionary that maps category IDs to category names
id_to_name = {i: name for i, name in enumerate(NYU40CLASSES)}

In [2]:
class CustomDataset(Dataset):
    def __init__(self, transform_target, data_list):
        self.data_list = data_list
        self.transform_target = transform_target

    def __getitem__(self, idx):
        data = self.data_list[idx]

        img = Image.open(data['image_path'])
        convert_tensor = T.ToTensor()
        img = convert_tensor(img)

        boxes = torch.tensor(data['boxes'], dtype=torch.float32)
        labels = torch.tensor(data['labels'], dtype=torch.int64)
        # suppose all instances are not crowd
        iscrowd = torch.zeros((len(data['labels']),), dtype=torch.int64)
        area = []
        for i in data['boxes']:
            area.append((i[2]-i[0])*(i[3]-i[1]))

        area = torch.tensor(area,dtype=torch.float32)    

        
        target = {
            'boxes': boxes,
            'labels': labels,
            'image_id' : idx,
            'iscrowd' : iscrowd,
            'area' : area
        }
        
        target = self.transform_target(target)

        return img, target
    
    def __len__(self):
        return len(self.data_list) 
    
def arrayFromSUNRGBD(pathToJSON):
    cocoDict = json.load(open(pathToJSON))
    
    listDict = []
    # Create a dictionary mapping image IDs to file names
    image_paths = {image["id"]: image["file_name"] for image in cocoDict["images"]}
    
    # Group annotations by image ID
    annotations_by_image = {}
    for annotation in cocoDict["annotations"]:
        if annotation["image_id"] not in annotations_by_image:
            annotations_by_image[annotation["image_id"]] = []
        annotations_by_image[annotation["image_id"]].append(annotation)
    
    for i in image_paths.keys():
        if i in annotations_by_image:
            data_sample = {}
            data_sample["image_path"] = image_paths[i]
            data_sample["boxes"] = [[annotation["bbox"][0], annotation["bbox"][1], annotation["bbox"][2] + annotation["bbox"][0], annotation["bbox"][3] + annotation["bbox"][1]] for annotation in annotations_by_image[i]]
            data_sample["labels"] = [annotation["category_id"] for annotation in annotations_by_image[i]]
            listDict.append(data_sample)
    
    return listDict   

def get_transform(train=True):
    image_transforms = []
    target_transforms = []
    if train:
        image_transforms.append(T.RandomHorizontalFlip(p=0.5))
    image_transforms.append(T.ConvertImageDtype(torch.float32))
    image_transforms.append(T.ToTensor())
    return T.Compose(image_transforms), T.Compose(target_transforms)


def calculatemAP(coco_evaluation):
    list_eval = coco_evaluation.coco_eval['bbox'].stats.tolist()
    ap_values = []
    ap_values.append(list_eval[0])
    ap_values.append(list_eval[3])
    ap_values.append(list_eval[4])
    ap_values.append(list_eval[5])
    mAP = np.average(ap_values)
    return mAP

def calculatemAR(coco_evaluation):
    list_eval = coco_evaluation.coco_eval['bbox'].stats.tolist()
    ap_values = []
    ap_values.append(list_eval[6])
    ap_values.append(list_eval[7])
    ap_values.append(list_eval[8])
    ap_values.append(list_eval[9])
    ap_values.append(list_eval[10])
    ap_values.append(list_eval[11])
    mAR = np.average(ap_values)
    return mAR

In [23]:
def get_model_resnet(num_classes, state_dict):
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    model.load_state_dict(torch.load(state_dict))
    model.eval()
    return model

def get_model_mobilenet(num_classes, state_dict):
    model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_fpn(pretrained=True)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    model.load_state_dict(torch.load(state_dict))
    model.eval()
    return model

def get_model_detr(state_dict):
    # Load the pre-trained model with the original number of classes
    model = torch.hub.load('facebookresearch/detr:main', 'detr_resnet50', pretrained=False)

    # Define a new classification head
    num_in_features = model.class_embed.in_features
    num_output_features = len(NYU40CLASSES) + 1 # Define your number of output features here

    # Replace the classification head
    model.class_embed = torch.nn.Linear(num_in_features, num_output_features)
    model.load_state_dict(torch.load(state_dict))
    model.eval()
    return model

# standard PyTorch mean-std input image normalization
transform = T.Compose([
    T.Resize(800),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# for output bounding box post-processing
def box_cxcywh_to_xyxy(x):
    x_c, y_c, w, h = x.unbind(1)
    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
        (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return torch.stack(b, dim=1)

def rescale_bboxes(out_bbox, size):
    img_w, img_h = size
    b = box_cxcywh_to_xyxy(out_bbox)
    b = b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
    return b

def detect(im, model, transform):
    # mean-std normalize the input image (batch-size: 1)
    im = to_pil_image(im)
    img = transform(im).unsqueeze(0)

    # demo model only support by default images with aspect ratio between 0.5 and 2
    # if you want to use images with an aspect ratio outside this range
    # rescale your image so that the maximum size is at most 1333 for best results
    assert img.shape[-2] <= 1600 and img.shape[-1] <= 1600, 'demo model only supports images up to 1600 pixels on each side'

    # propagate through the model
    outputs = model(img)

    # keep only predictions with 0.7+ confidence
    probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > 0.9

    # convert boxes from [0; 1] to image scales
    bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep], im.size)
    
    predictions = []
    for p, (xmin, ymin, xmax, ymax) in zip(probas, bboxes_scaled):
        prediction = {
            "boxes": torch.tensor([xmin, ymin, xmax, ymax]),
            "labels": torch.tensor([p.argmax()]),
            "scores": torch.tensor([p.max()]),
        }
        predictions.append(prediction)

    return predictions

def evaluate_model(model, data_loader, device):
    # put the model in evaluation mode
    model.eval()

    # create a CocoEvaluator
    coco_evaluator = CocoEvaluator(data_loader.dataset.coco, iou_types=["bbox"])

    with torch.no_grad():
        for images, targets in data_loader:
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) if torch.is_tensor(v) else v for k, v in t.items()} for t in targets]

            # get the model's predictions
            predictions = model(images)

            # convert predictions to the correct format
            predictions = {target["image_id"].item(): prediction for target, prediction in zip(targets, predictions)}

            # update the evaluator with the model's predictions
            coco_evaluator.update(predictions)

    # compute the metrics
    coco_evaluator.synchronize_between_processes()
    coco_evaluator.accumulate()
    coco_evaluator.summarize()

    # get the mAP and mAR
    stats = coco_evaluator.coco_eval['bbox'].stats
    mAP = stats[0:5].mean()
    mAR = stats[5:10].mean()

    return mAP, mAR

transform = T.Compose([
    T.Resize(800),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

def evaluate_detr(model, data_loader, device):
    # put the model in evaluation mode
    model.eval()

    # create a CocoEvaluator
    coco_evaluator = CocoEvaluator(data_loader.dataset.coco, iou_types=["bbox"])

    with torch.no_grad():
        for images, targets in data_loader:
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) if torch.is_tensor(v) else v for k, v in t.items()} for t in targets]

            # get the model's predictions
            predictions = [detect(image, model, transform) for image in images]

            # convert predictions to the correct format
            predictions = {target["image_id"].item(): prediction for target, prediction in zip(targets, predictions)}

            # update the evaluator with the model's predictions
            coco_evaluator.update(predictions)

    # compute the metrics
    coco_evaluator.synchronize_between_processes()
    coco_evaluator.accumulate()
    coco_evaluator.summarize()

    # get the mAP and mAR
    stats = coco_evaluator.coco_eval['bbox'].stats
    mAP = stats[0:5].mean()
    mAR = stats[5:10].mean()

    return mAP, mAR



In [21]:
transform_image, transform_target = get_transform(train=True)

test_dataset_COCO = COCO('../data/annotations/test_labels.json')
coco_dataset = CocoDetection(root='../data/images/', annFile='../data/annotations/test_labels.json', transform=T.ToTensor())
data_loader = DataLoader(coco_dataset, batch_size=1, shuffle=False)

resnet = get_model_resnet(41, 'resnet.pth')
resnet.to(device)

mobilenet = get_model_mobilenet(41, 'mobilenet.pth')
mobilenet.to(device)

detr = get_model_detr('detr.pth')

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
loading annotations into memory...
Done (t=0.01s)
creating index...
index created!


Using cache found in /home/gijsf/.cache/torch/hub/facebookresearch_detr_main


In [5]:
mAP_resnet, mAR_resnet = evaluate_model(resnet, data_loader, device)
print(f"ResNet: mAP = {mAP_resnet}, mAR = {mAR_resnet}")

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Accumulating evaluation results...
DONE (t=0.99s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.188
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.312
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.194
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.466
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.164
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.192
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.349
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.467
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.469
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.483
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.354
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= la

In [6]:
mAP_mobilenet, mAR_mobilenet = evaluate_model(mobilenet, data_loader, device)
print(f"MobileNet: mAP = {mAP_mobilenet}, mAR = {mAR_mobilenet}")

Accumulating evaluation results...
DONE (t=0.82s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.108
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.188
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.107
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.279
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.108
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.108
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.206
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.271
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.274
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.286
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.220
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= la

In [24]:
mAP_detr, mAR_detr = evaluate_detr(detr, data_loader, device)
print(f"DETR: mAP = {mAP_detr}, mAR = {mAR_detr}")

TypeError: list indices must be integers or slices, not str