In [None]:
import io
import json
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import os
import os.path as osp
import torch
import torch.nn as nn
from mmseg.core.evaluation import mean_iou
import sys

sys.path.append('../')
from contextlib import redirect_stdout
from easydict import EasyDict
from models.builder import EncoderDecoder
from utils.pyt_utils import load_model
from utils.transforms import normalize
from eval import get_matterport3d_pan_pipeline as get_eval_pipeline
from dataloader.dataloader import get_matterport3d_pan_pipeline as get_train_pipeline

# In-memory text buffer; stdout is redirected into it further down so that
# unwanted console output does not clutter the notebook.
trap = io.StringIO()


# Matterport3D RGB-D Panoramic

In [None]:
# --- Runtime options ---------------------------------------------------------
device = 'cuda:0'       # device the model and the input tensors are moved to
config = EasyDict()
save_figures = False    # when True, the last cell writes every panel to a PNG file
resolution = 50         # dpi used for the saved figures

# --- Dataset -----------------------------------------------------------------
config.root = '<sfss_mmsi_path>' # TODO: change this to your own path
config.dataset_path = osp.join(config.root, 'datasets', 'Matterport3D-1K')
config.dataset_name = 'Matterport3D'
config.ignore_index = 255
config.image_height = 512
config.image_width = 512
config.norm_mean = np.array([0.485, 0.456, 0.406])
config.norm_std = np.array([0.229, 0.224, 0.225])

# --- Model -------------------------------------------------------------------
config.backbone = 'trio_mit_b2' # TODO: change to 'mit_b2', 'dual_mit_b2', 'trio_mit_b2'
config.pretrained_model = osp.join(config.root, 'pretrained', 'segformers/mit_b2.pth')
config.decoder = 'DMLPDecoderV2'
config.decoder_embed_dim = 512
config.optimizer = 'AdamW'
config.use_dcns = [True, False, False, False]

# --- Data loading ------------------------------------------------------------
config.batch_size = 1
config.rgb = 'rgb-1K'
config.ann = 'semantic-1K'
# Auxiliary modalities that accompany the RGB image.
#   'trio_mit_b2' : ['depth-1K', 'normal-1K']  (the run directory below is "..._Depth_Normal"
#                   and the later cells only feed two auxiliary inputs to the model when
#                   two modalities are listed)
#   'dual_mit_b2' : ['depth-1K'] or ['normal-1K']
# The previous default listed depth only, which did not match the 'trio_mit_b2' backbone.
# NOTE(review): assumes the trio checkpoint was trained on depth + normal - confirm.
config.modality_x = ['depth-1K', 'normal-1K'] # TODO: for 'dual_mit_b2' use ['depth-1K'] or ['normal-1K']
config.train_source = osp.join(config.dataset_path, 'train.txt')
config.eval_source = osp.join(config.dataset_path, 'validation.txt')
config.test_source = osp.join(config.dataset_path, 'test.txt')
config.train_scale_array = [0.5, 0.75, 1, 1.25, 1.5, 1.75]
config.num_classes = 41

# --- Run directory / checkpoint ----------------------------------------------
# The run directory name only differs by a suffix that encodes the auxiliary modalities.
if config.backbone == 'mit_b2':
    run_suffix = ''
elif config.backbone == 'dual_mit_b2' and config.modality_x[0] == 'depth-1K':
    run_suffix = '_Depth'
elif config.backbone == 'dual_mit_b2' and config.modality_x[0] == 'normal-1K':
    run_suffix = '_Normal'
elif config.backbone == 'trio_mit_b2':
    run_suffix = '_Depth_Normal'
else:
    raise NotImplementedError(
        f'unsupported backbone/modality combination: {config.backbone!r} / {config.modality_x!r}')
config.log_dir = os.path.abspath(osp.join(
    config.root, 'workdirs', 'Matterport3D_1024x512',
    'log_' + config.dataset_name + '_' + config.backbone + '_DMLPDecoderV2' + run_suffix))
# log_dir is already absolute and normalised, so a plain join yields the same path as before.
config.checkpoint_pth = osp.join(config.log_dir, 'checkpoint', 'epoch-best.pth')


In [None]:
def _to_batched_tensor(array):
    """Prepend a batch axis to a 3-D array and return it as a float32 tensor on `device`.

    `.to(device)` is used (rather than `.cuda(device)`) so the inputs follow the same
    module-level `device` setting as the model, including non-CUDA devices.
    """
    batched = np.ascontiguousarray(array[None, :, :, :], dtype=np.float32)
    return torch.FloatTensor(batched).to(device)


def process_eval_image_rgbX(image, modal_x1, modal_x2, norm_mean, norm_std):
    """Normalise the RGB image and both auxiliary inputs and turn them into model inputs.

    Each array is passed through `normalize(array, norm_mean, norm_std)`, its last axis is
    moved to the front (`transpose(2, 0, 1)`), a batch axis is added and the result is
    moved to `device` as a float32 tensor.

    Args:
        image: RGB array; presumably (H, W, 3) - TODO confirm against the eval pipeline.
        modal_x1: first auxiliary array, same layout as `image`.
        modal_x2: second auxiliary array, same layout as `image`.
        norm_mean: per-channel mean forwarded to `normalize`.
        norm_std: per-channel standard deviation forwarded to `normalize`.

    Returns:
        Tuple `(image, modal_x1, modal_x2)` of float32 tensors with a leading batch axis.
    """
    def _prepare(array):
        normalized = normalize(array, norm_mean, norm_std)
        return _to_batched_tensor(normalized.transpose(2, 0, 1))

    return _prepare(image), _prepare(modal_x1), _prepare(modal_x2)


def process_train_image_rgbX(image, modal_x1, modal_x2, norm_mean, norm_std):
    """Turn already pre-processed arrays into batched model inputs.

    No normalisation or axis reordering is applied here; the arrays only get a batch axis
    and are moved to `device` as float32 tensors. `norm_mean` and `norm_std` are unused
    and only kept so the signature matches `process_eval_image_rgbX`.

    Returns:
        Tuple `(image, modal_x1, modal_x2)` of float32 tensors with a leading batch axis.
    """
    return (
        _to_batched_tensor(image),
        _to_batched_tensor(modal_x1),
        _to_batched_tensor(modal_x2),
    )


In [None]:
# Matterport3D data: stream the validation split through the evaluation pipeline
print('data streaming.....')
valid_pipeline = get_eval_pipeline(config, split_name='validation')
process_image_rgbX = process_eval_image_rgbX
config.eval_crop_size = [512, 1024]  # [height, width]

# Alternative: draw samples from the training pipeline instead
# valid_pipeline = get_train_pipeline(config)
# process_image_rgbX = process_train_image_rgbX
# config.eval_crop_size = [512, 512]  # [height, width]

valid_data_itr = iter(valid_pipeline)

# Every class id plus the ignore index counts as a legal target value.
valid_labels = [*range(config.num_classes), config.ignore_index]

# Colour table and class-name -> id mapping shipped with the dataset.
colors_path = os.path.join(config.dataset_path, 'assets/colors.npy')
with open(colors_path, 'rb') as f:
    seg_colors = np.load(f)

name2label_path = os.path.join(config.dataset_path, 'assets/name2label.json')
with open(name2label_path, 'r') as f:
    name2id = json.load(f)


In [None]:
# Fetch a sample: the first draw is discarded, the second one is used.
next(valid_data_itr)
batch = next(valid_data_itr)

image = batch[config.rgb]
target = batch[config.ann]
modal_x1 = batch[config.modality_x[0]]
if len(config.modality_x) == 2:
    modal_x2 = batch[config.modality_x[1]]
else:
    # With a single auxiliary modality the second slot reuses the first one.
    modal_x2 = modal_x1
filename = batch['sample_token']
print(filename)

image, modal_x1, modal_x2 = process_image_rgbX(image, modal_x1, modal_x2, config.norm_mean, config.norm_std)
target = torch.LongTensor(target.astype('int32')).unsqueeze(0).cuda(device)
target_labels = set(torch.unique(target).tolist())
assert target_labels.issubset(valid_labels), 'Unknown target label'


def _denormalize(batched):
    """Drop the batch axis, move channels last and undo the mean/std normalisation."""
    return batched.squeeze().permute(1, 2, 0).cpu().numpy() * config.norm_std + config.norm_mean


def _show_panel(panel, data, title, **imshow_kwargs):
    """Draw `data` on the mosaic axes called `panel`, hide its frame and set its title."""
    handle = axs[panel].imshow(data, **imshow_kwargs)
    axs[panel].set_axis_off()
    axs[panel].set_title(title)
    return handle


# visualize input: one row per panel, 6 inches of height each
if config.backbone == 'mit_b2':
    panel_names = ['RGB', 'GT']
elif config.backbone == 'dual_mit_b2' and config.modality_x[0] == 'depth-1K':
    panel_names = ['RGB', 'Depth', 'GT']
elif config.backbone == 'dual_mit_b2' and config.modality_x[0] == 'normal-1K':
    panel_names = ['RGB', 'NOR', 'GT']
elif config.backbone == 'trio_mit_b2':
    panel_names = ['RGB', 'Depth', 'NOR', 'GT']
else:
    raise NotImplementedError
fig, axs = plt.subplot_mosaic(
    [[panel] for panel in panel_names], figsize=(15, 6 * len(panel_names)), layout='constrained')

img = (_denormalize(image) * 255.0).astype('uint8')
plt_img1 = _show_panel('RGB', img, 'RGB Input')

if config.backbone != 'mit_b2' and 'depth-1K' in config.modality_x:
    dep = _denormalize(modal_x1)
    # Only the first channel is used: values equal to 1.0 are zeroed, the rest is scaled by 16.384.
    dep = np.where(dep[..., 0] == 1.0, 0.0, dep[..., 0]) * 16.384
    plt_img2 = _show_panel('Depth', dep, 'Depth Input', vmin=0, vmax=10, cmap='jet')

if config.backbone != 'mit_b2' and 'normal-1K' in config.modality_x:
    nor = (_denormalize(modal_x2) * 255.0).astype('uint8')
    plt_img3 = _show_panel('NOR', nor, 'Normal Input')

# Labels are shifted by one before indexing the colour table; the uint8 cast wraps
# ignore_index + 1 (= 256) around to 0.
groundtruth = target.long() + 1
gt = groundtruth.squeeze().cpu().numpy().astype('uint8')
_show_panel('GT', seg_colors[gt], 'Semantic GT')

patches = [
    mpatches.Patch(color=seg_colors[class_id], label=class_name)
    for class_name, class_id in name2id.items()
]

plt.legend(handles=patches, loc='lower center', ncol=7)

plt.show()


In [None]:
# Build the network without a training criterion, restore the checkpoint
# weights and move the model to the target device.
network = EncoderDecoder(cfg=config, criterion=None, norm_layer=nn.BatchNorm2d)
model = load_model(network, config.checkpoint_pth)
model = model.to(device)

# Switch to evaluation mode; stdout produced meanwhile is captured in `trap`.
with redirect_stdout(trap):
    model.eval()


In [None]:
# predict
# Every input must already have the evaluation resolution.
for batched_input in (image, modal_x1, modal_x2):
    assert list(batched_input.shape[-2:]) == config.eval_crop_size

# One auxiliary input for the RGB-only backbone or a single listed modality, two otherwise.
single_aux_input = config.backbone == 'mit_b2' or len(config.modality_x) == 1
with torch.no_grad():
    if single_aux_input:
        score = model.forward(image, modal_x1)
    elif len(config.modality_x) == 2:
        score = model.forward(image, modal_x1, modal_x2)
    else:
        raise NotImplementedError

output = torch.exp(score)
iou_result = mean_iou(
    results=output.argmax(1).cpu().numpy(),
    gt_seg_maps=target.cpu().numpy(),
    num_classes=config.num_classes,
    ignore_index=config.ignore_index,
    nan_to_num=None,
    label_map=dict(),
    reduce_zero_label=False,
)

# Names printed next to the per-class scores, in class-index order.
# NOTE(review): the list starts with 'void' while labels are shifted by +1 for colouring
# elsewhere - confirm that index 0 of the metrics really is 'void'.
id2class = [
    'void', 'wall', 'floor', 'chair', 'door', 'table', 'picture', 'cabinet', 'cushion', 'window',
    'sofa', 'bed', 'curtain', 'chest of drawers', 'plant', 'sink', 'stairs', 'ceiling', 'toilet',
    'stool', 'towel', 'mirror', 'tv monitor', 'shower', 'column', 'bathtub', 'counter', 'fireplace',
    'lighting', 'beam', 'railing', 'shelving', 'blinds', 'gym equipment', 'seating', 'board panel',
    'furniture', 'appliances', 'clothes', 'objects', 'misc',
]
for name, iou, acc in zip(id2class, iou_result['IoU'], iou_result['Acc']):
    print(f'{name:20s}:    iou {iou*100:5.3f}    /    acc {acc*100:5.3f}')

# Means ignore classes whose score is NaN.
mean_acc_pct = np.nanmean(iou_result['Acc']) * 100
mean_iou_pct = np.nanmean(iou_result['IoU']) * 100
print('Eval mAcc: {:.3f}, aAcc: {:.3f}, mIoU: {:.3f}'.format(mean_acc_pct,
                                                             iou_result['aAcc'] * 100,
                                                             mean_iou_pct))
miou = round(mean_iou_pct, 3)


In [None]:
# visualize prediction: ground truth on top, predicted label map below

fig, axs = plt.subplot_mosaic(
    [['GT'], ['Pred']], figsize=(15, 12), layout='constrained')

axs['GT'].imshow(seg_colors[gt])
axs['GT'].set_axis_off()
axs['GT'].set_title('Semantic GT')

# Take the arg-max on the floating-point scores. Casting them to integers first
# (as `output.long()` did) truncates the values, which creates ties between classes
# and makes the plotted map differ from the `output.argmax(1)` used for the metrics.
# The +1 shift matches the one applied to the ground truth before colour lookup.
predict = torch.argmax(output, dim=1) + 1
pred = predict.squeeze().cpu().numpy().astype('uint8')

# ignore_index + 1 (= 256) wraps to 0 as uint8, i.e. the "unknown" colour entry.
unlabeled = np.array(config.ignore_index + 1).astype(np.uint8)
# Pixels that are pure black in the RGB input are shown as unknown.
pred[img.sum(-1) == 0] = unlabeled  # mask as unknown id: 0
# pred[gt == unlabeled] = unlabeled  # mask as unknown id
axs['Pred'].imshow(seg_colors[pred])
axs['Pred'].set_axis_off()
axs['Pred'].set_title('Semantic Prediction')

patches = [
    mpatches.Patch(color=seg_colors[seg_val], label=seg_lbl)
    for seg_lbl, seg_val in name2id.items()
]

plt.legend(handles=patches, loc='lower center', ncol=7)

plt.show()


In [None]:
def _save_panel(data, out_path, **imshow_kwargs):
    """Draw `data` alone on a new figure, write it to `out_path` as PNG and return the figure."""
    panel_fig = plt.figure(figsize=(15, 10), layout='constrained', dpi=resolution)
    plt.imshow(data, **imshow_kwargs)
    plt.axis('off')
    plt.savefig(out_path, bbox_inches='tight', format='png')
    return panel_fig


if save_figures:
    # Output files are named after the sample token with any "tar/" removed.
    out_stem = filename.replace("tar/", "")
    has_aux_input = config.backbone != 'mit_b2'

    # save input(s)
    fig = _save_panel(img, f'{out_stem}_mp3d_rgb_img.png')

    if has_aux_input and 'depth-1K' in config.modality_x:
        fig = _save_panel(dep, f'{out_stem}_mp3d_dep_img.png', vmin=0, vmax=10, cmap='jet')

    if has_aux_input and 'normal-1K' in config.modality_x:
        fig = _save_panel(nor, f'{out_stem}_mp3d_nor_img.png')

    # save gt
    fig = _save_panel(seg_colors[gt], f'{out_stem}_mp3d_gt_img.png')

    # save prediction (the rounded mIoU is embedded in the file name)
    fig = _save_panel(seg_colors[pred], f'{out_stem}_mp3d_pred{miou}_img.png')
