In [12]:
import os
import random
import shutil
import rasterio
from rasterio.windows import Window
from tqdm import tqdm
tqdm.pandas()

In [14]:
def create_chips(image_path, output_dir):
    with rasterio.open(image_path) as dataset:
        height, width = dataset.shape

        for row in range(0, height, chip_size):
            for col in range(0, width, chip_size):
                chip_window = Window(col, row, chip_size, chip_size)
                chip = dataset.read(window=chip_window)

                chip_filename = f"row{row}_col{col}_" + os.path.splitext(os.path.basename(image_path))[0] + ".tif"
                chip_filepath = os.path.join(output_dir, chip_filename)

                with rasterio.open(
                    chip_filepath,
                    'w',
                    driver='GTiff',
                    height=chip_size,
                    width=chip_size,
                    count=dataset.count,
                    dtype=dataset.dtypes[0],
                    crs=dataset.crs,
                    transform=rasterio.windows.transform(chip_window, dataset.transform)
                ) as chip_dataset:
                    chip_dataset.write(chip, indexes=list(range(1, dataset.count + 1)))

In [15]:
image_path ='/work/scorreacardo_umass_edu/DeepSatGSD/data/interim/GE01/AOP_AF17_Q317_V0_502_307_134_9_R8C6_2010-09-29_GE01.tif'
os.path.splitext(os.path.basename(image_path))

('AOP_AF17_Q317_V0_502_307_134_9_R8C6_2010-09-29_GE01', '.tif')

In [16]:
os.listdir(original_data_dir)

['.ipynb_checkpoints',
 'GE01',
 'QB02',
 'WV02',
 'WV03_VNIR',
 'dg_metadata.gpkg',
 'dg_metadata_sensorcount3.gpkg',
 'dg_metadata_sensorcount4.gpkg']

In [17]:
# Set the paths to your original data directory and the destination dataset directory
original_data_dir = "/work/scorreacardo_umass_edu/DeepSatGSD/data/interim"
dataset_dir = "/work/scorreacardo_umass_edu/DeepSatGSD/data/processed"

In [18]:
sensors = ['GE01', 'QB02', 'WV02', 'WV03_VNIR']

In [10]:
[len(os.listdir(os.path.join(original_data_dir, sensor))) for sensor in sensors]

[35, 31, 30, 25]

In [19]:
# Define the train, validation, and test ratios
train_ratio = 0.7
validation_ratio = 0.15
test_ratio = 0.15

# Set the chip size
chip_size = 512

# Set a random seed for reproducibility
random.seed(42)

In [20]:
# Create the directories for train, validation, and test sets
train_dir = os.path.join(dataset_dir, 'train')
validation_dir = os.path.join(dataset_dir, 'validation')
test_dir = os.path.join(dataset_dir, 'test')

os.makedirs(train_dir, exist_ok=True)
os.makedirs(validation_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

In [23]:
# Loop through each sensor and distribute the images across the sets
sensors = ['GE01', 'QB02', 'WV02', 'WV03_VNIR']

for sensor_name in sensors:
    print(f"sensor name: {sensor_name}")
    sensor_images = [filename for filename in os.listdir(os.path.join(original_data_dir, sensor_name))]
    random.shuffle(sensor_images)

    train_count = int(len(sensor_images) * train_ratio)
    validation_count = int(len(sensor_images) * validation_ratio)
    test_count = len(sensor_images) - train_count - validation_count

    train_images = sensor_images[:train_count]
    print(f"size of train images: {len(train_images)}")
    validation_images = sensor_images[train_count:train_count + validation_count]
    print(f"size of validation images: {len(validation_images)}")
    test_images = sensor_images[train_count + validation_count:]
    print(f"size of test images: {len(test_images)}")
    
    # Create the sub-directories for train, validation, and test sets for each sensor
    sensor_train_dir = os.path.join(train_dir, sensor_name)
    sensor_validation_dir = os.path.join(validation_dir, sensor_name)
    sensor_test_dir = os.path.join(test_dir, sensor_name)
    
    os.makedirs(sensor_train_dir, exist_ok=True)
    os.makedirs(sensor_validation_dir, exist_ok=True)
    os.makedirs(sensor_test_dir, exist_ok=True)
    
    # Move the images to their respective directories and create chips
    for image in tqdm(train_images):
        src = os.path.join(original_data_dir + f"/{sensor_name}", image)
        dst = os.path.join(sensor_train_dir, image)
        shutil.copyfile(src, dst)
        create_chips(dst, sensor_train_dir)

    for image in tqdm(validation_images):
        src = os.path.join(original_data_dir + f"/{sensor_name}", image)
        dst = os.path.join(sensor_validation_dir, image)
        shutil.copyfile(src, dst)
        create_chips(dst, sensor_validation_dir)

    for image in tqdm(test_images):
        src = os.path.join(original_data_dir + f"/{sensor_name}", image)
        dst = os.path.join(sensor_test_dir, image)
        shutil.copyfile(src, dst)
        create_chips(dst, sensor_test_dir)
        

sensor name: GE01
size of train images: 24
size of validation images: 5
size of test images: 6


100%|██████████| 24/24 [07:22<00:00, 18.45s/it]
100%|██████████| 5/5 [00:49<00:00,  9.91s/it]
100%|██████████| 6/6 [01:04<00:00, 10.78s/it]


sensor name: QB02
size of train images: 21
size of validation images: 4
size of test images: 6


100%|██████████| 21/21 [03:47<00:00, 10.84s/it]
100%|██████████| 4/4 [00:52<00:00, 13.05s/it]
100%|██████████| 6/6 [01:14<00:00, 12.46s/it]


sensor name: WV02
size of train images: 21
size of validation images: 4
size of test images: 5


100%|██████████| 21/21 [04:09<00:00, 11.86s/it]
100%|██████████| 4/4 [01:06<00:00, 16.63s/it]
100%|██████████| 5/5 [00:45<00:00,  9.07s/it]


sensor name: WV03_VNIR
size of train images: 17
size of validation images: 3
size of test images: 5


100%|██████████| 17/17 [03:55<00:00, 13.84s/it]
100%|██████████| 3/3 [01:06<00:00, 22.24s/it]
100%|██████████| 5/5 [01:20<00:00, 16.15s/it]


In [28]:
test_path = "/work/scorreacardo_umass_edu/DeepSatGSD/data/processed/train/GE01"
size = len([f for f in os.listdir(test_path) if f.startswith("row")])
print(f"size of training data for GEO1: {size} chips of 512x512")

size of training data for GEO1: 16189
