In [2]:
'''
1. download zip files via:
 https://www.cityscapes-dataset.com/dataset-overview/
 gtFine_trainvaltest.zip (241MB) [md5], leftImg8bit_trainvaltest.zip (11GB) [md5]

2. manually unzip or run codes to unzip files, as:

from torchvision.datasets import Cityscapes as cityscapes  # will unzip the file when run at first time
import os
root ='/home/royliu/Documents/datasets/'
data_dir ='cityscapes'
data_dir = os.path.join(root, data_dir)
dataset = cityscapes(data_dir, split = 'train', mode ='fine', target_type = 'semantic')

will get data in below structure

├── gtFine
│   ├── train
│   │   ├── aachen
│   │   ├── bochum
│   │   └── ......
│   └── val
│       └── frankfurt
└── leftImg8bit
    ├── train
    │   ├── aachen
    │   ├── bochum
    │   └── ......
    └── val
        └── frankfurt


output:  4 dimentions ndarray of: mask, image
'''

import cv2
import numpy as np
import os
import argparse
import torchvision.transforms as transforms
from torchvision.datasets import Cityscapes
from PIL import Image
import torch

parser = argparse.ArgumentParser()
parser.add_argument('--root', metavar='root', default='/home/royliu/Documents/datasets/cityscapes')
parser.add_argument('--split', metavar='split', default='eval', help='"train" or "eval".')

def cv_to_pil(img_cv): # convert cv2 to PIL format
    img_pil = Image.fromarray(cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB))
    # img_pil = Image.fromarray(img_cv)  # not convert RGB sequence
    return img_pil


class CityTransform:
    def __call__(self, image, mask):
        image = cv_to_pil(image)  # to satisefy transforms.RandomResizedCrop's input requirement in shape of (...,h,w)
        transform_img = transforms.Compose([
                                transforms.RandomResizedCrop((256, 512)),
                                # transforms.RandomHorizontalFlip(),
                                transforms.ToTensor(),
                                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
        transform_mask = transforms.Compose([
                                transforms.RandomResizedCrop((256, 512))])
        return transform_img(image), transform_mask(mask)

class DataGenerator(Cityscapes):  ## varible names in Cityscapes Class are: images, targets, split...
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs, target_type="semantic")
        self.semantic_target_type_index = [i for i, t in enumerate(self.target_type) if t == "semantic"][0]
        self.colormap = self._generate_colormap()
        # self.transform = self._transform()

    def _transform(self, image, mask):
        transform = transforms.Compose([
                                transforms.RandomResizedCrop((256, 512)),
                                # transforms.RandomHorizontalFlip(),
                                transforms.ToTensor(),
                                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
        return transform(image), transform(mask)


    def _generate_colormap(self):
        colormap = {}
        for class_ in self.classes:
            if class_.train_id in (-1, 255):
                continue
            colormap[class_.train_id] = class_.id
        return colormap  # return a diction

    def _convert_to_segmentation_mask(self, mask):
        height, width = mask.shape[:2]
        segmentation_mask = np.zeros((height, width, len(self.colormap)), dtype=np.float32)
        for label_index, label in self.colormap.items():
            segmentation_mask[:, :, label_index] = (mask == label).astype(float)
        return segmentation_mask

    def __getitem__(self, index):
        image = cv2.imread(self.images[index])
        # image = cv_to_pil(image)  # transfer shape to channel, h, w
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # still in cv2 format
        image = image.transpose(2,0,1)  # now in shape of channel,h,w & RGB

        mask = cv2.imread(self.targets[index][self.semantic_target_type_index], cv2.IMREAD_UNCHANGED)
        mask = self._convert_to_segmentation_mask(mask)
        mask = mask.transpose(2, 0, 1)  # transfer shape to class#, h, w

        # if self.transform :
        #     transformed = self._transform(image=image, mask=mask)
        #     image = transformed["image"]
        #     mask = transformed["mask"]
        return image, mask



# cv2.setNumThreads(0)
# cv2.ocl.setUseOpenCL(False)
# transform = transforms.Compose([
#     transforms.RandomResizedCrop((256, 512)),
#     # transforms.RandomHorizontalFlip(),
#     transforms.ToTensor(),
#     transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

args = parser.parse_args()
# assert os.path.exists(args.root), 'Root of dataset is incorrect or miss.'
# dataset_train = DataGenerator(args.root, split = 'val', transforms= CityTransform) # default: mode='fine', target_type= 'sementic, split: train, test or val if mode=”fine” otherwise train, train_extra or val
# img_array, sgm = dataset_train[0]
# print(img_array.shape, sgm.shape)
# img_pil = Image.fromarray(img_array, 'RGB')
# img_pil.show()
# print(img.shape, sgm)

usage: ipykernel_launcher.py [-h] [--root root] [--split split]
ipykernel_launcher.py: error: unrecognized arguments: -f /home/royliu/.local/share/jupyter/runtime/kernel-7272269b-491a-48c2-9f3e-bc8e25207f25.json


SystemExit: 2