This notebook creates subsets of a large dataset by randomly selecting files until a user-defined maximum size (in GB), given in the variable `max_size`, is reached.

The variables `source_directory` and `target_directory` must be set before running the notebook.

In [1]:
import os
import random
import shutil
import glob

def get_dataset_size(dataset_dir):
    """Measure how much disk space the files below ``dataset_dir`` occupy.

    The directory tree is walked recursively and the size of every file
    found is added up.

    Args:
        dataset_dir (str): root directory to measure.

    Returns:
        tuple: ``(size_in_bytes, size_in_MB, size_in_GB)`` where MB and GB are
        obtained by dividing the byte count by 1024**2 and 1024**3.
    """
    byte_count = sum(
        os.path.getsize(os.path.join(folder, name))
        for folder, _subdirs, names in os.walk(dataset_dir)
        for name in names
    )

    # Express the byte count in larger, more readable units
    bytes_per_mb = 1024 * 1024
    bytes_per_gb = 1024 * 1024 * 1024

    return byte_count, byte_count / bytes_per_mb, byte_count / bytes_per_gb

In [6]:
def _names_matching_ids(file_names, wanted_ids):
    '''Return the file names that have one of ``wanted_ids`` as a whole "_"-separated field.'''
    wanted = set(wanted_ids)
    return [
        name for name in file_names
        if wanted.intersection(os.path.splitext(name)[0].split("_"))
    ]


def create_subset(source_dir, source_folder, target_dir, target_folder, batch_size, max_size):
    '''
    Creates a random subset of a dataset, limited by a maximum size in GB.

    Image ids are taken from the second "_"-separated field of the png file
    names (extension excluded). The ids are shuffled once and then consumed in
    batches of ``batch_size``, so no id is processed twice. For every id in a
    batch, all png files (images) and json files (labels) carrying that id are
    copied to the target. After each batch the size of the target folder is
    measured; copying stops as soon as it exceeds ``max_size`` or when every
    id of the source has been copied.

    Args:
        source_dir, source_folder (~string): path and name of the directory to read the data from. It must contain
        "images" and "labels" folders holding the png images and the json info files respectively.
        target_dir, target_folder (~string): path and name of the directory where the new dataset is stored.
        The "images" and "labels" sub-folders are created when missing.
        batch_size (~int): how many ids are randomly selected in each iteration (the larger the value,
        the less exactly the subset matches the desired size).
        max_size (~float): maximum desired size in GB.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    '''
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    source_images_dir = os.path.join(source_dir, source_folder, "images")
    source_labels_dir = os.path.join(source_dir, source_folder, "labels")
    target_root = os.path.join(target_dir, target_folder)
    target_images_dir = os.path.join(target_root, "images")
    target_labels_dir = os.path.join(target_root, "labels")
    os.makedirs(target_images_dir, exist_ok=True)
    os.makedirs(target_labels_dir, exist_ok=True)

    # The source listing does not change while copying, so it is read only once
    png_files = [name for name in os.listdir(source_images_dir) if name.endswith('.png')]
    json_files = [name for name in os.listdir(source_labels_dir) if name.endswith('.json')]

    # Collect the id of every image; names without a second field are ignored
    fields_per_file = (os.path.splitext(name)[0].split("_") for name in png_files)
    # Sorting first makes the shuffle reproducible when the random seed is fixed
    remaining_ids = sorted({fields[1] for fields in fields_per_file if len(fields) > 1})
    random.shuffle(remaining_ids)

    # Start from what the target already holds, so that re-running on a full
    # target does not copy yet another batch
    _, _, total_size_gb = get_dataset_size(target_root)

    position = 0
    while total_size_gb <= max_size:
        if position >= len(remaining_ids):
            # Nothing left to copy: stop instead of looping forever
            print("All source ids were copied before reaching max. size (GB): ", total_size_gb)
            return

        # Next batch of ids; the last batch may be smaller than batch_size
        batch_ids = remaining_ids[position:position + batch_size]
        position += len(batch_ids)

        # Copy the selected images to the target directory (target_folder/images)
        for name in _names_matching_ids(png_files, batch_ids):
            shutil.copyfile(os.path.join(source_images_dir, name),
                            os.path.join(target_images_dir, name))

        # Copy the selected labels to the target directory (target_folder/labels)
        for name in _names_matching_ids(json_files, batch_ids):
            shutil.copyfile(os.path.join(source_labels_dir, name),
                            os.path.join(target_labels_dir, name))

        _, _, total_size_gb = get_dataset_size(target_root)
        print("current target directory size (GB): ", total_size_gb)
    print("Max. size reached!: ", total_size_gb)

Before running the following, change source_directory and target_directory variables

In [7]:
# Paths used by the cells below -- adjust both before running the notebook.
# Note: an earlier version listed the folders "hold", "test" and "tier1" of the
# source directory to extract data from; "tier3" has another dataset with
# different disasters.

# Where the large dataset is read from
source_directory = '../data/last_subset'
# Where the subset is written to
target_directory = '../data/test_subset'

The data is divided into a training set and a test set, in the proportions 0.6 and 0.2 respectively.

The size of each set is controlled by the `max_size` variable set in the corresponding cell below.

In [8]:
# create TRAINING set

source_folder = "train"
target_folder = "train"

# Target sub-directories for the images and for their labels
train_images_dir = os.path.join(target_directory, target_folder, "images")
train_labels_dir = os.path.join(target_directory, target_folder, "labels")

# Create each directory on its own: with a single try block, an already
# existing images directory would prevent the labels directory from being created
for directory in (train_images_dir, train_labels_dir):
    try:
        os.makedirs(directory)
    except FileExistsError:
        print(f"Directory '{directory}' already exists.")

# set parameters of the training set
batch_size = 5  # Number of image ids that are randomly selected in each iteration
max_size = 1.2  # in GB

# Call the function to create the TRAINING subset
create_subset(source_directory, source_folder, target_directory, target_folder, batch_size, max_size)

Directory '../data/test_subset/train/images' already exists.
Directory '../data/test_subset/train/labels' already exists.
current target directory size (GB):  0.035128360614180565
current target directory size (GB):  0.07047110889106989
current target directory size (GB):  0.1134218629449606
current target directory size (GB):  0.15796535089612007
current target directory size (GB):  0.20389750134199858
current target directory size (GB):  0.24228623881936073
current target directory size (GB):  0.270885176025331
current target directory size (GB):  0.31638297718018293
current target directory size (GB):  0.3441677941009402
current target directory size (GB):  0.38357523549348116
current target directory size (GB):  0.40073469653725624
current target directory size (GB):  0.48580285627394915
current target directory size (GB):  0.5434338822960854
current target directory size (GB):  0.6092087794095278
current target directory size (GB):  0.6448980998247862
current target directory size

In [9]:
# create TEST set

source_folder = "test"
target_folder = "test"

# Target sub-directories for the images and for their labels.
# NOTE: the variable names are shared with the training cell, but from here on
# they point to the test directories.
train_images_dir = os.path.join(target_directory, target_folder, "images")
print(train_images_dir)
train_labels_dir = os.path.join(target_directory, target_folder, "labels")

# Create each directory on its own: with a single try block, an already
# existing images directory would prevent the labels directory from being created
for directory in (train_images_dir, train_labels_dir):
    try:
        os.makedirs(directory)
    except FileExistsError:
        print(f"Directory '{directory}' already exists.")

# set parameters of the test set
batch_size = 5  # Number of image ids that are randomly selected in each iteration
max_size = 0.4  # in GB

# Call the function to create the TEST subset
create_subset(source_directory, source_folder, target_directory, target_folder, batch_size, max_size)

../data/test_subset/test/images
current target directory size (GB):  0.022559155710041523
current target directory size (GB):  0.04454149957746267
current target directory size (GB):  0.0694432957097888
current target directory size (GB):  0.08138161338865757
current target directory size (GB):  0.10628349054604769
current target directory size (GB):  0.1286655217409134
current target directory size (GB):  0.15267665963619947
current target directory size (GB):  0.17895563505589962
current target directory size (GB):  0.19639226607978344
current target directory size (GB):  0.21783117298036814
current target directory size (GB):  0.2393433516845107
current target directory size (GB):  0.2542531508952379
current target directory size (GB):  0.27163968048989773
current target directory size (GB):  0.28503435757011175
current target directory size (GB):  0.29565922915935516
current target directory size (GB):  0.3195598116144538
current target directory size (GB):  0.33307626005262136
cur