In [28]:
import os
import random
import shutil
from sklearn.model_selection import train_test_split

In [29]:
# Set seed for reproducibility
# NOTE: this seeds only Python's built-in `random` module; the train/val/test splits
# further down pass their own `random_state=42` to train_test_split.
random.seed(42)

# Every dataset path below is built from the current working directory.
curr_path = os.getcwd()
print(curr_path)

D:\Masters\OVGU\VII_Semester\Thesis\jupyter_notebooks


In [30]:
# Define the paths.
# Each folder is passed to os.path.join as its own component so the separator is chosen
# by the host OS instead of being hard-coded as a Windows backslash.
data_dir = os.path.join(curr_path, 'Data', 'Dataset', 'Full Dataset', 'tiles')  # Folder with all your tiles
mask_dir = os.path.join(curr_path, 'Data', 'Dataset', 'Full Dataset', 'masks')  # Folder with all your masks
output_dir = os.path.join(curr_path, 'Data', 'Dataset', 'Final_Dataset')  # Where you want to save train, val, and test sets

In [31]:
# Destination folders, laid out as <output_dir>/<split>/{tiles,masks}
train_images_dir, train_masks_dir = (
    os.path.join(output_dir, 'train', kind) for kind in ('tiles', 'masks')
)
val_images_dir, val_masks_dir = (
    os.path.join(output_dir, 'val', kind) for kind in ('tiles', 'masks')
)
test_images_dir, test_masks_dir = (
    os.path.join(output_dir, 'test', kind) for kind in ('tiles', 'masks')
)

In [32]:
# Create every split folder up front; exist_ok=True means an already-existing folder is not an error
for split_dir in (
    train_images_dir,
    train_masks_dir,
    val_images_dir,
    val_masks_dir,
    test_images_dir,
    test_masks_dir,
):
    os.makedirs(split_dir, exist_ok=True)

In [33]:
# Sanity check: show the folder the training tiles will be copied into
print(train_images_dir)

D:\Masters\OVGU\VII_Semester\Thesis\jupyter_notebooks\Data\Dataset\Final_Dataset\train\tiles


In [34]:
# Raw directory listings for tiles and masks, each in sorted filename order
# (assumes image and mask names are related)
all_tiles = os.listdir(data_dir)
all_tiles.sort()
all_masks = os.listdir(mask_dir)
all_masks.sort()

In [35]:
# Ensure the number of tiles matches the number of masks.
# This compares only the lengths of the two raw directory listings; it does not check
# that each tile has a mask with a matching name.
assert len(all_tiles) == len(all_masks), "Mismatch between number of tiles and masks!"

In [36]:
# Keep only the tile files (names beginning with 'input_'), in sorted order
all_tiles = sorted(name for name in os.listdir(data_dir) if name.startswith('input_'))

In [37]:
# Function to get the corresponding mask file from a tile name
def get_mask_name(tile_name):
    """Return the mask filename that pairs with ``tile_name``.

    Only the first occurrence of ``'input_'`` is swapped for ``'mask_'``, so a
    later ``'input_'`` inside the name is left untouched. A name that does not
    contain ``'input_'`` at all is returned unchanged.
    """
    # count=1: an unbounded replace would also rewrite any later 'input_' in the name.
    return tile_name.replace('input_', 'mask_', 1)

In [38]:
# Derive each tile's mask filename, then verify that file is present in mask_dir
all_masks = list(map(get_mask_name, all_tiles))
for mask in all_masks:
    assert os.path.exists(os.path.join(mask_dir, mask)), f"Mask {mask} does not exist!"

In [39]:
# Split data into train (70%), val (15%), and test (15%)
# Step 1: hold out 30% of the tile/mask pairs. After this cell `test_tiles` / `test_masks`
# hold that entire 30%; the following split divides them into val and test.
train_tiles, test_tiles, train_masks, test_masks = train_test_split(
    all_tiles, all_masks, test_size=0.3, random_state=42
)

In [40]:
# Step 2: split the held-out pairs 50/50 into val and test.
# NOTE: this cell reads AND overwrites `test_tiles` / `test_masks`, so re-running it on its
# own splits the already-halved test set again. Re-run the 70/30 split first.
val_tiles, test_tiles, val_masks, test_masks = train_test_split(
    test_tiles, test_masks, test_size=0.5, random_state=42
)

In [41]:
# Function to copy files
def copy_files(file_list, src_dir, dst_dir, file_type='tiles'):
    """Copy every file named in ``file_list`` from ``src_dir`` into ``dst_dir``.

    Args:
        file_list: Iterable of file names, each relative to ``src_dir``.
        src_dir: Directory that currently holds the files.
        dst_dir: Directory to copy into; created (with parents) if it is missing.
        file_type: Unused. Kept so existing calls that pass it keep working.

    Raises:
        OSError: propagated from ``shutil.copy`` (e.g. ``FileNotFoundError`` when a
            listed file is not present in ``src_dir``).
    """
    # Without this, copying into a not-yet-created folder fails on the first file.
    os.makedirs(dst_dir, exist_ok=True)
    for file_name in file_list:
        shutil.copy(os.path.join(src_dir, file_name), os.path.join(dst_dir, file_name))

In [42]:
# Copy tiles and their masks for each split, in train -> val -> test order.
# Mask names are derived from the tile names, so every copied tile gets its own mask.
for split_tiles, tiles_dst, masks_dst in (
    (train_tiles, train_images_dir, train_masks_dir),
    (val_tiles, val_images_dir, val_masks_dir),
    (test_tiles, test_images_dir, test_masks_dir),
):
    copy_files(split_tiles, data_dir, tiles_dst)
    copy_files([get_mask_name(tile) for tile in split_tiles], mask_dir, masks_dst)

print("Data split and saved successfully!")

Data split and saved successfully!


In [22]:
# NOTE(review): this cell's execution count (22) is lower than the cells above it, and its
# recorded output shows a different path ('Data/Final_Dataset') from the earlier print of
# the same variable. The output appears to be stale; re-run after Restart & Run All.
print(train_images_dir)

D:\Masters\OVGU\VII_Semester\Thesis\jupyter_notebooks\Data/Final_Dataset\train\tiles
