In [21]:
import os
import random
import shutil
import rasterio
from rasterio.windows import Window
from tqdm import tqdm
# NOTE(review): presumably enables tqdm's pandas integration (progress_apply);
# no pandas call is visible in this part of the notebook — confirm it is still needed.
tqdm.pandas()

In [22]:
def create_chips(image_path, output_dir, tile_size=None):
    """Cut a raster into square, georeferenced GeoTIFF chips.

    The image is walked on a regular grid whose stride equals the chip edge.
    Each tile is written to ``output_dir`` as ``row{row}_col{col}_{stem}.tif``,
    where ``row``/``col`` are the pixel offsets of the tile's upper-left corner
    and ``stem`` is the source file name without its extension. Every chip
    carries the source CRS and an affine transform shifted to its own window.

    Tiles on the right/bottom border that overhang the image are read in
    boundless mode and padded (with the source nodata value, or 0 when the
    source declares none), so the pixel data always matches the
    ``tile_size`` x ``tile_size`` shape the output file is created with.

    Parameters
    ----------
    image_path : str
        Path of the raster to split.
    output_dir : str
        Directory that receives the chips. It is not created here.
    tile_size : int, optional
        Chip edge length in pixels. When omitted, the notebook-level
        ``chip_size`` is used, so existing two-argument calls are unchanged.

    Raises
    ------
    ValueError
        If the effective tile size is not positive.
    """
    # Fall back to the notebook-level setting for callers that pass two arguments.
    if tile_size is None:
        tile_size = chip_size
    if tile_size <= 0:
        raise ValueError(f"tile_size must be a positive integer, got {tile_size!r}")

    # Loop-invariant: the stem is shared by every chip cut from this image.
    image_stem = os.path.splitext(os.path.basename(image_path))[0]

    with rasterio.open(image_path) as dataset:
        height, width = dataset.shape
        band_indexes = list(range(1, dataset.count + 1))
        pad_value = dataset.nodata if dataset.nodata is not None else 0

        for row in range(0, height, tile_size):
            for col in range(0, width, tile_size):
                chip_window = Window(col, row, tile_size, tile_size)

                if row + tile_size <= height and col + tile_size <= width:
                    # Interior tile: a plain windowed read already has the full shape.
                    chip = dataset.read(window=chip_window)
                else:
                    # Border tile: the window overhangs the image, so pad it out
                    # instead of writing a short array into a full-size file.
                    chip = dataset.read(
                        window=chip_window,
                        boundless=True,
                        fill_value=pad_value,
                    )

                chip_filename = f"row{row}_col{col}_{image_stem}.tif"
                chip_filepath = os.path.join(output_dir, chip_filename)

                with rasterio.open(
                    chip_filepath,
                    'w',
                    driver='GTiff',
                    height=tile_size,
                    width=tile_size,
                    count=dataset.count,
                    dtype=dataset.dtypes[0],
                    crs=dataset.crs,
                    nodata=dataset.nodata,
                    transform=rasterio.windows.transform(chip_window, dataset.transform),
                ) as chip_dataset:
                    chip_dataset.write(chip, indexes=band_indexes)

In [23]:
# Scratch check of how a source file name splits into (stem, extension); the stem
# is what create_chips embeds in each chip's file name.
# NOTE: unlike the data roots configured further down, this path has no leading "/";
# that is harmless here because the file is never opened.
image_path = (
    'work/scorreacardo_umass_edu/DeepSatGSD/data/interim/GSD_150cm/AOP_AF17_Q317_V0_502_308_133_9_R3C5_2017-03-14_WV02_resampled_image_1.5m.tif'
)
os.path.splitext(os.path.split(image_path)[1])

('AOP_AF17_Q317_V0_502_308_133_9_R3C5_2017-03-14_WV02_resampled_image_1.5m',
 '.tif')

In [24]:
# Data locations: per-sensor source imagery is listed from original_data_dir,
# and the train/validation/test folders are created under dataset_dir.
original_data_dir, dataset_dir = (
    "/work/scorreacardo_umass_edu/DeepSatGSD/data/interim",
    "/work/scorreacardo_umass_edu/DeepSatGSD/data/processed",
)

In [13]:
# Sub-folder names under original_data_dir to process; each one is split and
# chipped independently by the main loop.
sensors = [
    'GSD_50cm',
    'GSD_65cm',
    'GSD_80cm',
    'GSD_100cm',
    'GSD_124cm',
    'GSD_150cm',
    'GSD_175cm',
    'GSD_200cm',
    'GSD_250cm',
    'GSD_300cm',
]

In [15]:
# Define the train, validation, and test ratios
train_ratio = 0.7
validation_ratio = 0.15
test_ratio = 0.15

# The split loop sizes the test set as "whatever is left", so test_ratio is never
# consumed directly. Check the three ratios agree so a typo cannot silently skew the split.
assert abs(train_ratio + validation_ratio + test_ratio - 1.0) < 1e-9, (
    "train_ratio + validation_ratio + test_ratio must sum to 1"
)

# Set the chip size (edge length, in pixels, of the square chips)
chip_size = 256

# Set a random seed for reproducibility
random.seed(42)

In [16]:
# One top-level folder per split, all rooted at dataset_dir
train_dir, validation_dir, test_dir = (
    os.path.join(dataset_dir, split_name)
    for split_name in ('train', 'validation', 'test')
)

# exist_ok=True keeps this cell safe to re-run
for split_root in (train_dir, validation_dir, test_dir):
    os.makedirs(split_root, exist_ok=True)

In [32]:
# For every sensor: split its images into train/validation/test, copy each image
# into the matching split folder, and cut it into chips next to the copy.

# A dedicated, locally seeded generator keeps this cell idempotent: re-running it
# (without re-running the seed cell) reproduces the same split instead of
# reshuffling and leaving one image in two different splits.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

for sensor_name in sensors:
    print(f"sensor name: {sensor_name}")
    sensor_source_dir = os.path.join(original_data_dir, sensor_name)

    # os.listdir returns entries in arbitrary order, so sort before shuffling;
    # otherwise the seed alone does not make the split reproducible.
    # Only GeoTIFFs are kept so stray entries (sidecar files, checkpoint folders)
    # are not copied and opened as rasters.
    sensor_images = sorted(
        filename
        for filename in os.listdir(sensor_source_dir)
        if filename.lower().endswith(('.tif', '.tiff'))
    )
    split_rng.shuffle(sensor_images)

    # The test split takes whatever remains after train and validation.
    train_count = int(len(sensor_images) * train_ratio)
    validation_count = int(len(sensor_images) * validation_ratio)
    validation_end = train_count + validation_count

    train_images = sensor_images[:train_count]
    validation_images = sensor_images[train_count:validation_end]
    test_images = sensor_images[validation_end:]

    # Per-sensor sub-directory inside each split folder
    sensor_train_dir = os.path.join(train_dir, sensor_name)
    sensor_validation_dir = os.path.join(validation_dir, sensor_name)
    sensor_test_dir = os.path.join(test_dir, sensor_name)

    split_plan = [
        ("train", train_images, sensor_train_dir),
        ("validation", validation_images, sensor_validation_dir),
        ("test", test_images, sensor_test_dir),
    ]

    for split_name, split_images, _ in split_plan:
        print(f"size of {split_name} images: {len(split_images)}")

    # Copy (not move) each image into its split folder, then chip the copy in place
    for split_name, split_images, split_dir in split_plan:
        os.makedirs(split_dir, exist_ok=True)
        for image in tqdm(split_images, desc=f"{sensor_name} {split_name}"):
            src = os.path.join(sensor_source_dir, image)
            dst = os.path.join(split_dir, image)
            shutil.copyfile(src, dst)
            create_chips(dst, split_dir)
        

sensor name: GSD_200cm
size of train images: 21
size of validation images: 4
size of test images: 5


100%|██████████| 21/21 [25:00<00:00, 71.46s/it] 
100%|██████████| 4/4 [04:03<00:00, 60.90s/it]
100%|██████████| 5/5 [07:27<00:00, 89.45s/it] 


sensor name: GSD_250cm
size of train images: 21
size of validation images: 4
size of test images: 5


100%|██████████| 21/21 [33:58<00:00, 97.07s/it]  
100%|██████████| 4/4 [01:39<00:00, 24.82s/it]
100%|██████████| 5/5 [04:18<00:00, 51.72s/it]


sensor name: GSD_300cm
size of train images: 21
size of validation images: 4
size of test images: 5


100%|██████████| 21/21 [38:37<00:00, 110.35s/it]
100%|██████████| 4/4 [02:14<00:00, 33.54s/it]
100%|██████████| 5/5 [02:02<00:00, 24.48s/it]


In [45]:
# Count the chips written for one sensor's training split. Chip files are the ones
# prefixed with "row"; the copied full-size source images in the same folder are not.
test_path = "/work/scorreacardo_umass_edu/DeepSatGSD/data/processed/train/GSD_300cm"
size = sum(1 for f in os.listdir(test_path) if f.startswith("row"))
# Take the sensor label from the path itself so the message cannot disagree with
# the folder that was actually counted.
print(f"size of training data for {os.path.basename(test_path)}: {size} chips of 256x256")

size of training data for GSD_250cm: 73306 chips of 256x256
