#### This notebook contains the first version of the splitting code. It only selects files at random until a user-defined maximum size (in GB), set in the variable `max_size`, is reached.

#### source_directory and target_directory are variables that need to be set

In [None]:
import os
import random
import shutil
import glob

def get_dataset_size(dataset_dir):
    """Measure the combined size of all files located under ``dataset_dir``.

    The directory tree is traversed with ``os.walk`` and the size in bytes of
    every file found is added up.

    Returns:
        A tuple ``(size_in_bytes, size_in_mb, size_in_gb)`` where the MB and GB
        values are the byte count divided by 1024**2 and 1024**3 respectively.
    """
    byte_count = sum(
        os.path.getsize(os.path.join(parent, name))
        for parent, _subdirs, names in os.walk(dataset_dir)
        for name in names
    )

    # Binary units: 1 MB = 1024 * 1024 bytes, 1 GB = 1024 * 1024 * 1024 bytes
    bytes_per_mb = 1024 * 1024
    bytes_per_gb = 1024 * 1024 * 1024

    return byte_count, byte_count / bytes_per_mb, byte_count / bytes_per_gb

In [None]:
def create_subset(source_dir, source_folder, target_dir, target_folder, batch_size, max_size):
    """Copy randomly chosen image/label files into a subset folder until a size limit is passed.

    The names of the ``.tif`` files in ``<source_dir>/<source_folder>/images`` are split
    on "_" and the second field is used as the id. In every iteration up to ``batch_size``
    ids that were not used before are drawn at random; every ``.tif`` file in ``images``
    and every ``.json`` file in ``labels`` whose name contains one of those ids is copied
    to the sub-folder of the same name in ``<target_dir>/<target_folder>``.

    The size of the target folder is measured before the first batch and after every
    batch. Copying stops as soon as that size is greater than ``max_size``, so the final
    size can exceed ``max_size`` by up to one batch. Copying also stops when no unused
    id is left, which previously resulted in an endless loop.

    Parameters:
        source_dir: directory that contains ``source_folder``.
        source_folder: folder inside ``source_dir`` with ``images`` and ``labels`` sub-folders.
        target_dir: directory that contains ``target_folder``.
        target_folder: folder inside ``target_dir``; its ``images`` and ``labels``
            sub-folders must already exist.
        batch_size: maximum number of ids drawn per iteration (at least 1).
        max_size: size limit of the target folder in GB.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # random.sample(ids, 0) selects nothing, so the loop below could never finish
        raise ValueError("batch_size must be at least 1")

    source_root = os.path.join(source_dir, source_folder)
    target_root = os.path.join(target_dir, target_folder)

    # The source listing does not change while copying, so it is read only once.
    # Names without "_" carry no id and are skipped; sorting makes the draws
    # reproducible when the caller seeds the `random` module.
    tif_files = [name for name in os.listdir(os.path.join(source_root, "images")) if name.endswith('.tif')]
    remaining_ids = sorted({
        parts[1]
        for parts in (name.split("_") for name in tif_files)
        if len(parts) > 1
    })

    # Take content that is already in the target folder into account
    _, _, total_size_gb = get_dataset_size(target_root)

    while total_size_gb <= max_size:
        if not remaining_ids:
            print("All ids were used before reaching the max. size. Current size (GB): ", total_size_gb)
            return

        # Randomly select a batch among the ids that were not used yet; the batch
        # shrinks when fewer than batch_size ids are left
        selected_ids = random.sample(remaining_ids, min(batch_size, len(remaining_ids)))
        drawn = set(selected_ids)
        remaining_ids = [file_id for file_id in remaining_ids if file_id not in drawn]

        # Copy the matching files: first images (.tif), then labels (.json)
        for sub_folder, extension in (("images", ".tif"), ("labels", ".json")):
            # glob.escape keeps special characters in the path or id from acting as wildcards
            escaped_dir = glob.escape(os.path.join(source_root, sub_folder))
            for file_id in selected_ids:
                pattern = os.path.join(escaped_dir, "*" + glob.escape(file_id) + "*" + extension)
                for source_path in glob.glob(pattern):
                    target_path = os.path.join(target_root, sub_folder, os.path.basename(source_path))
                    shutil.copyfile(source_path, target_path)

        total_size, total_size_mb, total_size_gb = get_dataset_size(target_root)
        print("current target directory size (GB): ",total_size_gb)
    print("Max. size reached!: ",total_size_gb)

Before running the following cells, set the `source_directory` and `target_directory` variables to your own paths.

In [None]:
# Directories in the source directory from which data can be extracted
# (tier3 has another dataset with different disasters):
#sub_dirs = ["hold","test","tier1"]

# Paths used for creating the subset
# Path to the large dataset directory
source_directory = "/Volumes/Elements/data_buidings/geotiffs"
# Path to the directory where the subset will be created
target_directory = "/Users/gmeneses/DScourse/00_capstone/Capstone_Building_Damage/data/subset"

The data is divided between training, test and holdout sets in the proportions 0.6, 0.2 and 0.2, respectively.

The resulting dataset has a total size of 8.71 GB.

See the `max_size` variable below for the size limit of each folder.

In [None]:
# create TRAINING set (the "tier1" folder is the training data in the original dataset)

source_folder = "tier1"
target_folder = "training"
#images
train_images_dir = os.path.join(target_directory, target_folder,"images")
print(train_images_dir)
#labels
train_labels_dir = os.path.join(target_directory, target_folder,"labels")
# create directories; exist_ok=True lets this cell be re-run (e.g. after an
# interrupted copy) without failing with FileExistsError
os.makedirs(train_images_dir, exist_ok=True)
os.makedirs(train_labels_dir, exist_ok=True)

# set parameters for the training set
batch_size=5       # Number of ids that are randomly selected in each iteration
max_size = 4.8 # in GB

# Call the function to create the TRAINING subset
create_subset(source_directory, source_folder, target_directory, target_folder, batch_size, max_size)

In [None]:
# create TEST set

source_folder = "test"
target_folder = "test"
#images (the variable names are kept from the training cell)
train_images_dir = os.path.join(target_directory, target_folder,"images")
print(train_images_dir)
#labels
train_labels_dir = os.path.join(target_directory, target_folder,"labels")
# create directories; exist_ok=True lets this cell be re-run (e.g. after an
# interrupted copy) without failing with FileExistsError
os.makedirs(train_images_dir, exist_ok=True)
os.makedirs(train_labels_dir, exist_ok=True)

# set parameters for the test set
batch_size=5       # Number of ids that are randomly selected in each iteration
max_size = 1.6 # in GB

# Call the function to create the TEST subset
create_subset(source_directory, source_folder, target_directory, target_folder, batch_size, max_size)

In [None]:
# create HOLDOUT set

source_folder = "hold"
target_folder = "holdout"
#images (the variable names are kept from the training cell)
train_images_dir = os.path.join(target_directory, target_folder,"images")
print(train_images_dir)
#labels
train_labels_dir = os.path.join(target_directory, target_folder,"labels")
# create directories; exist_ok=True lets this cell be re-run (e.g. after an
# interrupted copy) without failing with FileExistsError
os.makedirs(train_images_dir, exist_ok=True)
os.makedirs(train_labels_dir, exist_ok=True)

# set parameters for the holdout set
batch_size=5       # Number of ids that are randomly selected in each iteration
max_size = 1.6 # in GB

# Call the function to create the HOLDOUT subset
create_subset(source_directory, source_folder, target_directory, target_folder, batch_size, max_size)