In [None]:
%matplotlib notebook
import os

import numpy as np
import torch
import torch.utils.data as td
import torchvision as tv
from matplotlib import pyplot as plt
from PIL import Image

# Select the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)



In [None]:
import xml.etree.ElementTree as ET
def parse_rec(filename):
    """ Parse a PASCAL VOC xml file

    Inputs:
        filename: path (or open file object) of the annotation XML

    Outputs:
        A list with one dictionary per top-level <object> element, with keys
            name: text of the <name> tag
            pose: text of the <pose> tag, or None when the tag is absent
            truncated: int value of <truncated>, 0 when absent or empty
            difficult: int value of <difficult>, 0 when absent or empty
            bbox: [xmin, ymin, xmax, ymax] as ints
        The list is empty when the file contains no <object> element.

    Raises:
        AttributeError if an object has no <name> or <bndbox> tag.
    """
    tree = ET.parse(filename)
    objects = []
    for obj in tree.findall('object'):
        bbox = obj.find('bndbox')
        obj_struct = {}
        obj_struct['name'] = obj.find('name').text
        # pose/truncated/difficult are treated as optional: a missing tag
        # falls back to a neutral value instead of raising AttributeError.
        obj_struct['pose'] = obj.findtext('pose')
        obj_struct['truncated'] = int(obj.findtext('truncated') or 0)
        obj_struct['difficult'] = int(obj.findtext('difficult') or 0)
        # Go through float() so coordinates written as decimals ("45.7")
        # are accepted; they are rounded to the nearest pixel.
        obj_struct['bbox'] = [int(round(float(bbox.find(tag).text)))
                              for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
        objects.append(obj_struct)

    return objects

In [None]:
dataset_root_dir = "/datasets/ee285f-public/PascalVOC2012/"

class PascalVOCDataset(td.Dataset):
    '''
    Detection view of PascalVOC2012: each item is an image together with a
    per-class mask built from its bounding-box annotations.

    Inputs:
        root_dir: Directory of PascalVOC2012 dataset
        mode: "train", "val", "trainval"

    Attributes:
    class_dict : dictionary mapping each class name, spelled as in the
        annotation files (e.g. "diningtable", "pottedplant"), to its mask channel
    num_classes : number of entries in class_dict
    files : dictionary mapping each split name to its list of image ids
    images_dir, annot_dir : the JPEGImages and Annotations sub-directories
    __len__() : returns length of dataset (number of ids in the selected split)
    __getitem__(idx)
        Outputs: (image, mask, objects)
            image: the image as a torch tensor of values between [-1,1] of dimensions (3,h,w)
            mask: a mask of bounding box labels of dimensions (K,h,w) where K is number of classes
            objects: list of the annotation dictionaries with keys {name, pose, truncated, difficult, bbox}.
                name and bbox (bounding box) are probably the more important ones, which are already used in the mask.

    Raises:
        ValueError: if mode is not one of the supported splits
        OSError: if a split file under ImageSets/Main cannot be opened
    '''
    SPLITS = ("train", "val", "trainval")

    def __init__(self, root_dir, mode = 'train'):
        super(PascalVOCDataset, self).__init__()
        # Fail early with a clear message instead of a KeyError in __len__.
        if mode not in self.SPLITS:
            raise ValueError("mode must be one of {}, got {!r}".format(self.SPLITS, mode))
        self.files = {}
        self.mode = mode
        # Keys must match the <name> text of the annotations exactly; VOC
        # spells "diningtable" and "pottedplant" without a space.
        self.class_dict = {"person":0, "bird":1, "cat":2, "cow":3,
                           "dog":4, "horse":5, "sheep":6,
                           "aeroplane":7, "bicycle":8, "boat":9, "bus":10, "car":11,
                           "motorbike":12, "train":13,"bottle":14, "chair":15,
                           "diningtable":16, "pottedplant":17, "sofa":18, "tvmonitor":19}

        self.num_classes = len(self.class_dict)

        for split in self.SPLITS:
            path = os.path.join(root_dir, "ImageSets/Main", split + ".txt")
            # The context manager closes the file; blank lines are dropped so
            # they cannot turn into empty image ids.
            with open(path, "r") as split_file:
                stripped = (line.rstrip() for line in split_file)
                self.files[split] = [id_ for id_ in stripped if id_]

        self.images_dir = os.path.join(root_dir, "JPEGImages")
        self.annot_dir = os.path.join(root_dir, "Annotations")

    def __len__(self):
        return len(self.files[self.mode])

    def __repr__(self):
        return "PascalVOCDataset(mode={})".format(self.mode)

    def __getitem__(self, idx):
        file_id = self.files[self.mode][idx]
        img_path = os.path.join(self.images_dir, file_id + '.jpg')
        annot_path = os.path.join(self.annot_dir, file_id + '.xml')

        objects = parse_rec(annot_path)

        img = Image.open(img_path).convert('RGB')
        # ToTensor gives a (3,h,w) tensor; Normalize with mean 0.5 and
        # std 0.5 per channel then maps the values to [-1,1].
        transform = tv.transforms.Compose([
            tv.transforms.ToTensor(),
            tv.transforms.Normalize((0.5,0.5,0.5),(0.5,0.5,0.5))
        ])

        image = transform(img)
        mask = torch.zeros((self.num_classes, image.shape[1], image.shape[2]))
        for obj in objects:
            obj_idx = self.class_dict[obj['name']]
            xmin, ymin, xmax, ymax = obj['bbox']
            # NOTE(review): the box is used directly as a half-open slice; if
            # the annotation coordinates are 1-based inclusive the region is
            # shifted by one pixel — confirm against the dataset convention.
            mask[obj_idx, ymin:ymax, xmin:xmax] = 1
        return image, mask, objects



In [None]:
def voc_collate(batch):
    """Group a list of (image, mask, objects) samples without stacking them.

    The dataset does not resize its images and every sample carries its own
    number of annotations, so the default collation (which stacks tensors of
    equal shape) cannot build a batch. Each field is returned as a list with
    one entry per sample instead.
    """
    images, masks, objects = zip(*batch)
    return list(images), list(masks), list(objects)

# One dataset object per split.
train_set = PascalVOCDataset(dataset_root_dir, mode= 'train')
val_set = PascalVOCDataset(dataset_root_dir, mode= 'val')
trainval_set = PascalVOCDataset(dataset_root_dir, mode= 'trainval')

# Batch size shared by both loaders.
B = 50
train_loader = td.DataLoader(train_set, batch_size = B, pin_memory = True,
                             shuffle = True, collate_fn = voc_collate)
val_loader = td.DataLoader(val_set, batch_size = B, pin_memory = True,
                           collate_fn = voc_collate)


In [None]:
def myimshow(image, ax=plt):
    """Show a channel-first image tensor on the given axes.

    The tensor is moved to the CPU, reordered from (c, h, w) to (h, w, c),
    rescaled with (x + 1) / 2 and clamped to [0, 1] before being drawn.
    The axis decorations are switched off.

    Inputs:
        image: torch tensor with three dimensions, channel first
        ax: object providing imshow() and axis(); defaults to pyplot

    Outputs:
        The handle returned by ax.imshow.
    """
    pixels = image.to('cpu').numpy()
    pixels = np.transpose(pixels, (1, 2, 0))
    pixels = np.clip((pixels + 1) / 2, 0, 1)
    handle = ax.imshow(pixels)
    ax.axis('off')
    return handle
# Draw sample 10 of the training split and print the box of its first object.
fig, ax = plt.subplots()
x, mask, objects = train_set[10]
h = myimshow(x, ax)
first_object = objects[0]
print(first_object['bbox'])

In [9]:
# NOTE(review): `file_list` is not assigned at top level in any cell visible
# here (it only appears as a local inside PascalVOCDataset.__init__), so this
# cell presumably relies on leftover kernel state — confirm or remove.
file_list

['2007_000032',
 '2007_000039',
 '2007_000063',
 '2007_000068',
 '2007_000121',
 '2007_000170',
 '2007_000241',
 '2007_000243',
 '2007_000250',
 '2007_000256',
 '2007_000333',
 '2007_000363',
 '2007_000364',
 '2007_000392',
 '2007_000480',
 '2007_000504',
 '2007_000515',
 '2007_000528',
 '2007_000549',
 '2007_000584',
 '2007_000645',
 '2007_000648',
 '2007_000713',
 '2007_000720',
 '2007_000733',
 '2007_000738',
 '2007_000768',
 '2007_000793',
 '2007_000822',
 '2007_000836',
 '2007_000876',
 '2007_000904',
 '2007_001027',
 '2007_001073',
 '2007_001149',
 '2007_001185',
 '2007_001225',
 '2007_001340',
 '2007_001397',
 '2007_001416',
 '2007_001420',
 '2007_001439',
 '2007_001487',
 '2007_001595',
 '2007_001602',
 '2007_001609',
 '2007_001698',
 '2007_001704',
 '2007_001709',
 '2007_001724',
 '2007_001764',
 '2007_001825',
 '2007_001834',
 '2007_001857',
 '2007_001872',
 '2007_001901',
 '2007_001917',
 '2007_001960',
 '2007_002024',
 '2007_002055',
 '2007_002088',
 '2007_002099',
 '2007_0

In [10]:
# Empty mapping intended to hold one list of image ids per split name.
files = dict()

In [11]:
# Register the id list under the 'train' key.
# NOTE(review): `file_list` has no top-level definition in the visible cells;
# presumably it comes from earlier kernel state — confirm.
files.update(train=file_list)

In [20]:
from PIL import Image
import glob
# Paths of every PNG file in the SegmentationClass folder. glob.glob already
# returns a list; it is sorted because glob gives no ordering guarantee and a
# stable order keeps re-runs reproducible.
image_list = sorted(glob.glob('/datasets/ee285f-public/PascalVOC2012/SegmentationClass/*.png'))

In [None]:
import data
import torchvision.transforms as transforms
dataset_root_dir = "/datasets/ee285f-public/PascalVOC2012/"
download = False

# How to preprocess the image (e.g. none, crop, shrink)
image_transform_params = {'image_mode': 'none'}

# How to preprocess the targets
target_transform_params = {'target_mode': 'preprocessed'}

# The post-processing of the image
image_transform = transforms.ToTensor()

# The root directory variable defined above is `dataset_root_dir`; passing the
# undefined name `dataset_dir` as the value raised NameError.
train_dataset, valid_dataset = data.make_trainval_dataset(
        dataset_dir             = dataset_root_dir,
        image_transform_params  = image_transform_params,
        transform               = image_transform,
        target_transform_params = target_transform_params,
        download                = download)

# Show the first training sample as a quick sanity check.
print(train_dataset[0])


In [27]:
# Probe for torchvision's built-in VOC detection dataset.
# NOTE(review): when this cell was run it raised the AttributeError recorded
# below (this torchvision.datasets has no attribute 'VOCDetection'), so the
# cell fails under Run All with that install — remove it or guard it.
tv.datasets.VOCDetection(dataset_root_dir, year = '2012',image_set='train', download=False)

AttributeError: module 'torchvision.datasets' has no attribute 'VOCDetection'

In [2]:
import torchvision as tv
# Print the help text of the installed torchvision.datasets package; the
# PACKAGE CONTENTS listing in the recorded output shows no VOC module.
# NOTE(review): exploratory output — consider clearing before sharing.
help(tv.datasets)

Help on package torchvision.datasets in torchvision:

NAME
    torchvision.datasets

PACKAGE CONTENTS
    cifar
    coco
    fakedata
    folder
    lsun
    mnist
    omniglot
    phototour
    semeion
    stl10
    svhn
    utils

CLASSES
    torch.utils.data.dataset.Dataset(builtins.object)
        torchvision.datasets.cifar.CIFAR10
            torchvision.datasets.cifar.CIFAR100
            torchvision.datasets.stl10.STL10
        torchvision.datasets.coco.CocoCaptions
        torchvision.datasets.coco.CocoDetection
        torchvision.datasets.fakedata.FakeData
        torchvision.datasets.folder.DatasetFolder
            torchvision.datasets.folder.ImageFolder
        torchvision.datasets.lsun.LSUN
        torchvision.datasets.lsun.LSUNClass
        torchvision.datasets.mnist.MNIST
            torchvision.datasets.mnist.EMNIST
            torchvision.datasets.mnist.FashionMNIST
        torchvision.datasets.omniglot.Omniglot
        torchvision.datasets.phototour.PhotoTour
       