<a href="https://colab.research.google.com/github/ssnirgudkar/Datasetpaper-final/blob/main/Datasetload_%26_augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive (remounting if it is already mounted).
from google.colab import drive
drive.mount('/content/drive', force_remount = True)
# NOTE(review): `!cd` is executed in a temporary subshell, so it does not change the
# working directory of the notebook kernel; `%cd` would be needed for that. Confirm
# whether the later relative 'segments/...' paths are meant to live under this folder.
!cd "drive/My Drive/IRDatasetFinal"

Mounted at /content/drive


In [2]:
# Add a Drive folder to the module search path; presumably it holds the helper modules
# imported later (DownloadSegmentedImages, ArrangeMaskFiles, ...) - verify.
import sys
sys.path.append('/content/drive/MyDrive/PythonLibraryforIRDatasetFinal')

In [3]:
# Install/upgrade the segments-ai client, which provides the `segments` package imported below.
# NOTE(review): the version is not pinned, so a re-run may pick up a different release.
!pip3 install segments-ai --upgrade 

Collecting segments-ai
  Downloading segments-ai-0.54.tar.gz (12 kB)
Building wheels for collected packages: segments-ai
  Building wheel for segments-ai (setup.py) ... [?25l[?25hdone
  Created wheel for segments-ai: filename=segments_ai-0.54-py3-none-any.whl size=14681 sha256=d0fb1c19b8197cde62239237a1a208bdc77deddc280f13f33b6377bf99b5ac5b
  Stored in directory: /root/.cache/pip/wheels/6f/b2/ab/66f70f1fe800500afc6d43870988dd7a2feb4e3ce325459ede
Successfully built segments-ai
Installing collected packages: segments-ai
Successfully installed segments-ai-0.54


In [4]:
''' This script downloads a dataset from segments.ai and prepares it for use with any machine learning
model. It performs the following steps:
1. Download images from segments.ai
2. Move mask images into their own directory
3. Resize images and masks
4. Relabel mask images from the instance segmentation scheme to the semantic segmentation scheme.
5. Convert IR images to color
6. Augment the data: mirror, rotate (+-2, +-5, +-7), brightness (+-5, +-10, +-15), blur?
7. Configure split (take parameter) (training:validation:testing::80/10/10)
8. Run validation script - check image pixels, file existence, correct labels, abrupt changes (to black pixels);
   print a message if there is an error.
## Please make sure that you install the latest segments-ai package because its API arguments keep changing.
## From an (administrative) command shell: pip install segments-ai --upgrade

@TODO : Support reading the COCO format JSON file that gets downloaded when the API is used.
'''
import os
import sys
import argparse

import DownloadSegmentedImages
import ArrangeMaskFiles
import randomiseFileName
import ProcessMaskForSemanticSegmentation
import Imageutilities
import ValidateInputData
import shutil
from segments import SegmentsClient, SegmentsDataset

class DatasetCreator:

    def __init__(self, account_name, datasets_identifier, version, userlabel, export_format, new_image_size, mode, outputdatadir, file_test_images_list):
        self.userlabel = userlabel # 'ground-truth' or 'segmentation'
        self.account_name = account_name
        self.datasets_identifier = datasets_identifier
        self.version = version
        self.new_image_size = new_image_size
        self.segmentation_image_dir = None
        self.export_format = export_format
        self.initial_sub_dir = self.version + "_" + self.export_format
        self.mode = mode
        self.file_test_images_list = file_test_images_list
        if (mode != "downloadonly"):
            if (outputdatadir is None):
                print("In augmentonly mode, you must specify top level dataset directory")
                sys.exit(1)
            self.destination_dir = outputdatadir
            self.destination_image_dir = os.path.join(outputdatadir, "images")
            self.destination_segmentation_image_dir = os.path.join(outputdatadir, "masks")

        self.datasets = {}
        # Initialize a SegmentsDataset from the release file
        if (account_name == "ssnirgudkar"):
            client = SegmentsClient('a89182567b17766b91773021b18d04574cd75109')
        if (account_name == "brunswick"):
            client = SegmentsClient('98e631fa8e12a9a4d998b999d6f8e25aa2feb358')
        if (datasets_identifier == "all"):
            datasets = client.get_datasets()
            for adataset in datasets:
                if (adataset["name"] == "playground"):
                    continue
                self.datasets[adataset["name"]] = {}
                top_dir = account_name + '_' + adataset["name"]
                self.datasets[adataset["name"]]['image_dir'] = os.path.join('segments', top_dir, self.initial_sub_dir)
                print("image_dir for {0}={1}".format(adataset["name"], self.datasets[adataset["name"]]['image_dir']))
        else:    
            datasets = self.datasets_identifier.split(',')
            for adataset in datasets:
                self.datasets[adataset] = {}
                top_dir = account_name + '_' + adataset
                self.datasets[adataset]['image_dir'] = os.path.join('segments', top_dir, self.initial_sub_dir)
                print("image_dir for {0}={1}".format(adataset, self.datasets[adataset]['image_dir']))

        # create a nested dictionary 
        # self.datasets{'identifier'}
        #             {'json_file'} => XYZ.json
        #             {'image_dir'} => v1_semantic
        #             {'mask_dir'}  => semantic_XXX


    def downloadDataset(self, aDatasetIdentifier):
        
        dictForDownloadscript = {'account_name':self.account_name, 'dataset_path':aDatasetIdentifier, 
                                 'version':self.version, 'export_format': self.export_format, 'userlabel': self.userlabel}
        json_file = DownloadSegmentedImages.main(dictForDownloadscript)
        print("json_file={0}".format(json_file))
        self.datasets[aDatasetIdentifier]['json_file'] = json_file
        # find name of json file in subdirectory of segments.
        rootDir = 'segments/'
        dir, json_raw_file_name = os.path.split(json_file[0])
        print("dir={0}, json_raw_file_name={1}".format(dir, json_raw_file_name))
        json_raw_file_name_without_ext = json_raw_file_name[:-5]
        print("json_raw_file_name_without_ext={0}".format(json_raw_file_name_without_ext))
        # JSON file name is made up of XXX-VYY.json, find last occurrence of '-' in string so that
        # it can be split as XXX VYY
        pos = json_raw_file_name_without_ext.rfind('-')
        first_part_json_raw_file_name_without_ext = json_raw_file_name_without_ext[0:pos]
        second_part_json_raw_file_name_without_ext = json_raw_file_name_without_ext[pos+1:]
        assert first_part_json_raw_file_name_without_ext != ".", "JSON File splitting got messed up"
        print("first_part_json_raw_file_name_without_ext = {0}".format(first_part_json_raw_file_name_without_ext))
        print("second_part_json_raw_file_name_without_ext = {0}".format(second_part_json_raw_file_name_without_ext))

        for fileName in os.listdir(rootDir):
            subdir = os.path.join(rootDir, fileName)
            if (os.path.isdir(subdir)):
                if (first_part_json_raw_file_name_without_ext in fileName):  # Here main part of json file should match
                    print("Located {0} matching {1}".format(fileName, first_part_json_raw_file_name_without_ext))
                    rootDir2 = subdir
                    print("rootDir2={0}".format(rootDir2))
                    for fileName2 in os.listdir(subdir):
                        subdir2 = os.path.join(rootDir2, fileName2)
                        if (os.path.isdir(subdir2)):
                            if (second_part_json_raw_file_name_without_ext in fileName2 and
                                self.export_format in fileName2): # Here version number of json file should match
                                print("Located {0} matching {1}".format(fileName2, second_part_json_raw_file_name_without_ext))
                                self.datasets[aDatasetIdentifier]['image_dir'] = subdir2
                                break

    '''
    Export format = semantic
    Folder name : v1.0_semantic, contains XXX.png, XXX_label_ground-truth.png,
    XXX_label_ground-truth_semantic.png
    '''
    def arrangeMaskFiles(self, aDatasetIdentifier):
        print("Arrange mask files starts ...")
        currentDir = os.getcwd()
        if not os.path.isdir(self.destination_dir):
            os.mkdir(self.destination_dir)
        dictForMaskFileArrangement = {'image_dir': self.datasets[aDatasetIdentifier]['image_dir'], 'destination_image_dir':self.destination_dir}
        (dest_image_dir, segmt_dir, panop_dir) = ArrangeMaskFiles.main(dictForMaskFileArrangement)
        print("image_dir={0}, segmentation_dir={1}, panoptic_dir={2}\n".format(dest_image_dir, segmt_dir, panop_dir))
        self.destination_image_dir = dest_image_dir
        self.destination_segmentation_image_dir = segmt_dir
        if not os.path.isdir(self.destination_image_dir):
            print("Directory {0} was not created. Aborting...\n".format(self.destination_image_dir))
            sys.exit(1)  
        if not os.path.isdir(self.destination_segmentation_image_dir):
            print("Directory {0} was not created. Aborting...\n".format(self.destination_segmentation_image_dir))
            sys.exit(1)
        print("Arrange mask files ended ...")


    def relabelMaskImages(self):
        print("Mask relabeling operation starts ...")
        dictForMaskRelabel = {'image_dir': self.destination_segmentation_image_dir, 'image_file': None,
                              'userlabel': self.userlabel}
        print("image_dir={0}, userlabel={1}".format(self.destination_segmentation_image_dir, self.userlabel))
        self.destination_semantic_relabel_dir = ProcessMaskForSemanticSegmentation.main(dictForMaskRelabel)
        if not os.path.isdir(self.destination_semantic_relabel_dir):
            print("Directory {0} was not created. Aborting...\n".format(self.destination_semantic_relabel_dir))
            sys.exit(1)
        print("Mask relabeling operation ended ...")

    def randomizeFileNames(self):
        print("Randomize file names starts ...")
        dictForRandomizeFileNames = {'image_dir':self.destination_image_dir, 'mask_dir':self.destination_semantic_relabel_dir}
        (new_image_dir, new_masked_image_dir) = randomiseFileName.main(dictForRandomizeFileNames)
        self.destination_image_dir = new_image_dir
        self.destination_segmentation_image_dir = new_masked_image_dir
        print("new image dir = {0}, new segmentation dir = {1}".format(new_image_dir, new_masked_image_dir))
        print("Randomize file names done ...")    

    def convertIRToRGB(self):
        # Just convert the images in the directory XXX_semantic_random_filenames. 
        # Do not touch any other directories 
        print("IR2RGB operation starting ...")
        convertDict = {'image_file': None, 'image_dir': self.destination_image_dir, 'operation':'convert2rgb'}
        print("image_dir={0}".format(self.destination_image_dir))
        Imageutilities.main(convertDict)
        print("IR2RGB operation done ...")


    def resizeDataset(self):
        # The mask images are reorganized in their own directories in previous step.
        # The IR images are converted to equivalent RGB images in previous step.
        # So now we have to traverse one directory up and visit 2 directories only
        # Directory containing ir->color images and directory which has ground truth such as
        # segmentation. It is waste to convert images in other directories, causes confusion!
        # and carry out resize/mirror etc. operations.
        # This operation has assumption that earlier ir->rgb conversion has been run
        print("Resizing operation starting ...")
        print("destination_image_dir={0}".format(self.destination_image_dir))
        oneDirUp = os.path.abspath(os.path.join(self.destination_image_dir, os.pardir))
        for subdir in os.listdir(oneDirUp):
            #if ('semantic_ground_truth_relabel_random_filenames' in subdir):
            print("subdir={0}".format(subdir))
            if ('semantic_ground_truth_relabel' in subdir and (not 'resized' in subdir)):
                fullpath = os.path.join(oneDirUp, subdir)
                print("subdir fullpath={0}".format(fullpath))
                resizeDict = {'image_file': None, 'image_dir': fullpath, 'operation': 'resize', 'new_image_size': self.new_image_size, 'grayscale' : True}
                Imageutilities.main(resizeDict)
            elif ('_gray2color' in subdir and (not 'resized' in subdir)):
                fullpath = os.path.join(oneDirUp, subdir)
                print("subdir fullpath={0}".format(fullpath))
                resizeDict = {'image_file': None, 'image_dir': fullpath, 'operation': 'resize', 'new_image_size': self.new_image_size, 'grayscale' : False}
                Imageutilities.main(resizeDict)
        print("Resizing operation done ...")

    

    def rotateDataset(self):
        print("rotateDataset operation starting ...")
        oneDirUp = os.path.abspath(os.path.join(self.destination_image_dir, os.pardir))
        for subdir in os.listdir(oneDirUp):
            #if ('resized' in subdir): # This will match image directory name and mask directory name
            if ('images_gray2color' in subdir or 'semantic_ground_truth_relabel' in subdir):
                fullpath = os.path.join(oneDirUp, subdir)
                isgrayscale = False
                if ('ground_truth' in subdir):
                    isgrayscale = True
                elif ('gray2scale' in subdir):
                    isgrayscale = False
                rotateDict = {'image_file': None, 'image_dir': fullpath, 'operation': 'rotate', 'rotation_angle': 2, 'grayscale' : isgrayscale}    
                Imageutilities.main(rotateDict)    
                rotateDict = {'image_file': None, 'image_dir': fullpath, 'operation': 'rotate', 'rotation_angle': 5, 'grayscale' : isgrayscale}    
                Imageutilities.main(rotateDict)    
                rotateDict = {'image_file': None, 'image_dir': fullpath, 'operation': 'rotate', 'rotation_angle': 7, 'grayscale' : isgrayscale}    
                Imageutilities.main(rotateDict)    
                rotateDict = {'image_file': None, 'image_dir': fullpath, 'operation': 'rotate', 'rotation_angle': -2, 'grayscale' : isgrayscale}    
                Imageutilities.main(rotateDict)    
                rotateDict = {'image_file': None, 'image_dir': fullpath, 'operation': 'rotate', 'rotation_angle': -5, 'grayscale' : isgrayscale}    
                Imageutilities.main(rotateDict)    
                rotateDict = {'image_file': None, 'image_dir': fullpath, 'operation': 'rotate', 'rotation_angle': -7, 'grayscale' : isgrayscale}    
                Imageutilities.main(rotateDict)    
        print("rotateDataset operation done ...")

    def brightenImage(self):
        print("brightenImage operation starting ...")
        oneDirUp = os.path.abspath(os.path.join(self.destination_image_dir, os.pardir))
        mask_dir = ''
        image_dir = ''
        for subdir in os.listdir(oneDirUp):
            fullpath = os.path.join(oneDirUp, subdir)
            print("subdir={0}".format(subdir))
            if ('gray2color_resized_rotated' in subdir): 
                image_dir = os.path.join(oneDirUp, subdir)
            elif (('_resized_rotated' in subdir)):
                mask_dir = fullpath
                oneDirUp = os.path.abspath(os.path.join(fullpath, os.pardir))
                mask_brightness_dir_name = os.path.join(oneDirUp, subdir + '_brightness_adjusted')
                print("brightness_dir_name for masks={0}".format(mask_brightness_dir_name))
                if not os.path.isdir(mask_brightness_dir_name):
                    os.mkdir(mask_brightness_dir_name)
        
        # Here images in image_dir and mask_dir have to be processed together because for mask
        # images it is only copy but to the appropriately named file
        brightenDir = {'image_file': None, 'image_dir': image_dir, 'mask_dir' : mask_dir, 'operation': 'brighten', 'increment':'5'}
        Imageutilities.main(brightenDir)
        brightenDir = {'image_file': None, 'image_dir': image_dir, 'mask_dir' : mask_dir, 'operation': 'brighten', 'increment':'10'}
        Imageutilities.main(brightenDir)
        brightenDir = {'image_file': None, 'image_dir': image_dir, 'mask_dir' : mask_dir, 'operation': 'brighten', 'increment':'15'}
        Imageutilities.main(brightenDir)
        brightenDir = {'image_file': None, 'image_dir': image_dir, 'mask_dir' : mask_dir, 'operation': 'brighten', 'increment':'-5'}
        Imageutilities.main(brightenDir)
        brightenDir = {'image_file': None, 'image_dir': image_dir, 'mask_dir' : mask_dir, 'operation': 'brighten', 'increment':'-10'}
        Imageutilities.main(brightenDir)
        brightenDir = {'image_file': None, 'image_dir': image_dir, 'mask_dir' : mask_dir, 'operation': 'brighten', 'increment':'-15'}
        Imageutilities.main(brightenDir)
        brightenDir = {'image_file': None, 'image_dir': image_dir, 'mask_dir' : mask_dir, 'operation': 'brighten'} # copy base image
        Imageutilities.main(brightenDir)
        print("brightenImage operation done ...")
    

    def mirrorImage(self):
        print("mirrorImage operation starting ...")
        oneDirUp = os.path.abspath(os.path.join(self.destination_image_dir, os.pardir))
        for subdir in os.listdir(oneDirUp):
            #if ('resized_rotated_brightness_adjusted' in subdir):
            if ('_rotated' in subdir):
                fullpath = os.path.join(oneDirUp, subdir)
                mirrorDir = {'image_file': None, 'image_dir': fullpath, 'operation': 'mirror'}
                print("image_dir={0}".format(fullpath))
                Imageutilities.main(mirrorDir)
        print("mirrorImage operation done ...")

    def separateTestingImages(self):
        if (self.file_test_images_list is None):
            return
        print("separateTestingImages starts...\n")
        if (False == os.path.exists(self.file_test_images_list)):
            print("File {0} does not exist.".format(self.file_test_images_list))

        oneDirUp = os.path.abspath(os.path.join(self.destination_image_dir, os.pardir))
        print("destination dir={0}\n".format(self.destination_image_dir))
        print("parent dir={0}\n".format(oneDirUp))

        if (not os.path.isdir(os.path.join(oneDirUp, 'select_test_images'))):
            os.mkdir(os.path.join(oneDirUp, 'select_test_images'))
        if (not os.path.isdir(os.path.join(oneDirUp, 'select_test_masks'))):
            os.mkdir(os.path.join(oneDirUp, 'select_test_masks'))
        
        fh = open(self.file_test_images_list)
        lines = fh.readlines()
        fh.close()

        testFileNames = []
        for eachline in lines:
            print("eachline={0}\n".format(eachline))
            (dirIdentifier, fileName) = eachline.split('/')
            fileName = fileName.strip()
            fileName = fileName.replace('.', '_', 1)
            fileName = "a" + fileName
            print("filename={0}".format(fileName))
            testFileNames.append(fileName)
        for subdir in os.listdir(oneDirUp):
            if ('_gray2color' in subdir):
                fullpath = os.path.join(oneDirUp, subdir)
                print("subdir={0}".format(fullpath))
                for eachfile in os.listdir(fullpath):
                    if (eachfile in testFileNames):
                        print("Filename matched = {0}".format(eachfile))
                        shutil.move(os.path.join(fullpath, eachfile), os.path.join(oneDirUp, 'select_test_images'))
            elif ('semantic_ground_truth_relabel' in subdir):
                fullpath = os.path.join(oneDirUp, subdir)
                print("subdir={0}".format(fullpath))
                for eachfile in os.listdir(fullpath):
                    if (eachfile in testFileNames):
                        print("Filename matched = {0}".format(eachfile))
                        shutil.move(os.path.join(fullpath, eachfile), os.path.join(oneDirUp, 'select_test_masks'))
        
        print("separateTestingImages ended...\n")

    def createCumulativeDataset(self):
        if ((self.mode == "downloadonly") or (self.mode == "downloadandaugment")):
            for adataset in self.datasets:
                self.downloadDataset(adataset)

        if ((self.mode == "augmentonly") or (self.mode == "downloadandaugment")):
            for adataset in self.datasets:      
                self.arrangeMaskFiles(adataset)
            self.relabelMaskImages()
            #self.randomizeFileNames()
            self.convertIRToRGB()
            #self.separateTestingImages()
            #self.resizeDataset()
            self.rotateDataset()
            #self.brightenImage()
            self.mirrorImage()

    #def validateDataSize(self):
    #    validateDict = {''}
    #    ValidateInputData.main(validateDict)

    def create(args):
        dsCreator = DatasetCreator(args['account_name'], args['dataset_identifier'], 
                                   args['release_version'], args['userlabel'], args['export_format'],
                                   #args['new_image_size'], args['mode'], args['outputdatadir'], args['file_test_images_list'])
                                   None, args['mode'], args['outputdatadir'], None)
        print(args)
        #dsCreator = DatasetCreator(args.account_name, args.dataset_identifier, 
        #                           args.release_version, args.userlabel, args.export_format,
        #                           args.new_image_size, args.mode, args.outputdatadir, args.file_test_images_list)

        dsCreator.createCumulativeDataset()                                   




In [5]:
'''
sys.argv = "--account_name = 'ssnirgudkar' --dataset_identifier = 'all' --release_version = 'v2.0' 
--userlabel = 'segmentation' --export_format = 'semantic' --mode = 'downloadonly' 
--outputdatadir = 'drive/My Drive/IRDatasetFinal'"
print (sys.argv)
'''

# Options for this run: one dataset ('PilotIR'), release v2.0, semantic export,
# download followed by augmentation, output written below the mounted Drive folder.
args = {'account_name' :'ssnirgudkar', "dataset_identifier" :'PilotIR', "release_version" :'v2.0', "userlabel" :'segmentation', "export_format" :'semantic', "mode" :'downloadandaugment', "outputdatadir" : '/content/drive/MyDrive/IRDatasetFinal'} 
print (args)

'''
args['account_name'] = 'ssnirgudkar'
args['dataset_identifier'] = 'all'
args['release_version'] = 'v2.0'
args['userlabel'] = 'segmentation'
args['export_format'] = 'semantic'
args['mode'] = 'downloadonly'
args['outputdatadir'] = 'drive/My Drive/IRDatasetFinal'
'''
# Build the DatasetCreator from the options above and run the pipeline.
DatasetCreator.create(args)

'''
parser = argparse.ArgumentParser(description="Utility to create dataset (end to end)")
parser.add_argument("--account_name", type=str, 
                        help="Provide account name of segments.ai dataset")
parser.add_argument("--dataset_identifier", type=str, 
                        help="Provide name of segments.ai dataset")
parser.add_argument("--release_version", type=str, 
                        help="Provide release file number")
parser.add_argument("--userlabel", type=str, 
                        help="Provide user label : ground-truth or segmentation")                      
parser.add_argument("--new_image_size", type=str, 
                        help="Image size in height,width format") 
parser.add_argument("--export_format", type=str, default=None,
                        help="semantic/panoptic, see https://docs.segments.ai/export")
parser.add_argument("--mode", type=str, help="one of downloadonly, augmentonly, downloadandaugment")
parser.add_argument("--outputdatadir", type=str, help="Specify if the mode is augmentonly or downloadandaugment")
parser.add_argument("--file_test_images_list", type=str, help="Specify full path of file containing list of test images, this is optional")


args = parser.parse_args()
'''


{'account_name': 'ssnirgudkar', 'dataset_identifier': 'PilotIR', 'release_version': 'v2.0', 'userlabel': 'segmentation', 'export_format': 'semantic', 'mode': 'downloadandaugment', 'outputdatadir': '/content/drive/MyDrive/IRDatasetFinal'}
Initialized successfully.
image_dir for PilotIR=segments/ssnirgudkar_PilotIR/v2.0_semantic
{'account_name': 'ssnirgudkar', 'dataset_identifier': 'PilotIR', 'release_version': 'v2.0', 'userlabel': 'segmentation', 'export_format': 'semantic', 'mode': 'downloadandaugment', 'outputdatadir': '/content/drive/MyDrive/IRDatasetFinal'}
account name = ssnirgudkar
dataset path = PilotIR
version = v2.0
user label = segmentation
Initialized successfully.
{'uuid': '23ae2992-b7e2-41e8-ba27-8439a2b6c160', 'name': 'v2.0', 'description': 'Final release for December', 'release_type': 'JSON', 'attributes': {'url': 'https://segmentsai-prod.s3.eu-west-2.amazonaws.com/releases/23ae2992-b7e2-41e8-ba27-8439a2b6c160.json'}, 'status': 'SUCCEEDED', 'status_info': '', 'created_at'

100%|██████████| 79/79 [00:03<00:00, 20.19it/s]


Initialized dataset with 79 images.
Exporting dataset. This may take a while...


  return _convert(image, np.uint8, force_copy)
100%|██████████| 79/79 [00:01<00:00, 57.93it/s]


Exported to segments/ssnirgudkar_PilotIR/v2.0
Images downloaded to dir = segments/ssnirgudkar_PilotIR/v2.0
Directory renamed to = segments/ssnirgudkar_PilotIR/v2.0_semantic
Dataset downloaded successfully.
json_file=['PilotIR-v2.0.json']
dir=, json_raw_file_name=PilotIR-v2.0.json
json_raw_file_name_without_ext=PilotIR-v2.0
first_part_json_raw_file_name_without_ext = PilotIR
second_part_json_raw_file_name_without_ext = v2.0
Located ssnirgudkar_PilotIR matching PilotIR
rootDir2=segments/ssnirgudkar_PilotIR
Located v2.0_semantic matching v2.0
Arrange mask files starts ...
Starting mask arrangement ...
ArrangeMaskFiles: dirName=segments/ssnirgudkar_PilotIR/v2.0_semantic
ArrangeMaskFiles: eachFile = a1602783046_314322_label_segmentation_semantic.png
ArrangeMaskFiles: eachFile = a1602782502_365406.png
ArrangeMaskFiles: eachFile = a1603391605_223658_label_segmentation.png
ArrangeMaskFiles: eachFile = a1602782532_698585_label_segmentation_semantic.png
ArrangeMaskFiles: eachFile = a1602783100_5

AssertionError: ignored