# Code for creating a Hugging Face dataset out of high-resolution satellite images.

Import packages

In [23]:
from PIL import Image as PILImage
import os
import evaluate
import numpy as np
import matplotlib.pyplot as plt
from patchify import patchify
from datasets import load_dataset, DatasetDict, Dataset, Image as HFImage

Log in to Hugging Face if needed.

In [2]:
import huggingface_hub

# Never paste an access token into the notebook: it would be stored with the
# notebook file and end up in version control. The token is read from the
# HF_TOKEN environment variable instead; when that variable is unset,
# token=None makes login() prompt for the token interactively.
huggingface_hub.login(token=os.environ.get("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/sking11/.cache/huggingface/token
Login successful


Create your lists of training images, validation images, and test images.

In [3]:
# Each entry pairs a scene image with its mask so the two can never drift out
# of order; the separate image/label lists used downstream are derived below.

#training dataset
_train_pairs = [
    ('/explore/nobackup/people/sking11/MakingDinoDataset/gliht_1964_rgbi.tif',
     '/explore/nobackup/people/sking11/MakingDinoDataset/binarymasks/cleaned_1964_binarymask_600cluster.png'),
    ('/explore/nobackup/people/sking11/MakingDinoDataset/gliht_1965_rgbi.tif',
     '/explore/nobackup/people/sking11/MakingDinoDataset/binarymasks/cleaned_1965_binarymask_600cluster.png'),
    ('/explore/nobackup/people/sking11/MakingDinoDataset/gliht_1966_rgbi.tif',
     '/explore/nobackup/people/sking11/MakingDinoDataset/binarymasks/cleaned_1966_binarymask_600cluster.png'),
    ('/explore/nobackup/people/sking11/MakingDinoDataset/gliht_1967_rgbi.tif',
     '/explore/nobackup/people/sking11/MakingDinoDataset/binarymasks/cleaned_1967_binarymask_400cluster.png'),
    ('/explore/nobackup/people/sking11/MakingDinoDataset/gliht_1968_rgbi.tif',
     '/explore/nobackup/people/sking11/MakingDinoDataset/binarymasks/cleaned_1968_binarymask_300cluster.png'),
]
image_paths_train = [image_path for image_path, _ in _train_pairs]
label_paths_train = [label_path for _, label_path in _train_pairs]

#cross validation dataset
_val_pairs = [
    ('/explore/nobackup/people/sking11/MakingDinoDataset/gliht_1969_rgbi.tif',
     '/explore/nobackup/people/sking11/MakingDinoDataset/binarymasks/cleaned_1969_binarymask_500cluster.png'),
]
image_paths_val = [image_path for image_path, _ in _val_pairs]
label_paths_val = [label_path for _, label_path in _val_pairs]

#test dataset
_test_pairs = [
    ('/explore/nobackup/people/sking11/MakingDinoDataset/gliht_1970_rgbi.tif',
     '/explore/nobackup/people/sking11/MakingDinoDataset/binarymasks/cleaned_1970_binarymask_600cluster.png'),
]
image_paths_test = [image_path for image_path, _ in _test_pairs]
label_paths_test = [label_path for _, label_path in _test_pairs]

This set of functions tiles the images and masks into patches of your specified size, removes any image/mask pairs whose mask is "empty" (contains only a single value, i.e. all 0s or all 1s), and returns the remaining images and masks as NumPy arrays.

In [4]:
def split_image_and_mask(image_path, mask_path, patch_size=256):
    """Load one image/mask pair and tile both into non-overlapping patches.

    The image is converted to 3-channel RGB before tiling. A mask with a
    trailing singleton channel axis is squeezed to 2-D; any other mask is
    passed to ``patchify`` as loaded.

    Args:
        image_path: Path of the image file to open with PIL.
        mask_path: Path of the mask file to open with PIL.
        patch_size: Edge length, in pixels, of the square patches. The same
            value is used as the step, so patches do not overlap.

    Returns:
        Tuple ``(image_patches, mask_patches)`` as produced by ``patchify``:
        ``image_patches`` has shape ``(rows, cols, 1, patch_size, patch_size, 3)``
        and ``mask_patches`` has shape ``(rows, cols, patch_size, patch_size)``.
    """
    # Open both files in context managers so the file handles are released as
    # soon as the pixel data has been copied into NumPy arrays, instead of
    # staying open until garbage collection.
    with PILImage.open(image_path) as image_file:
        image = np.array(image_file.convert('RGB'))
    with PILImage.open(mask_path) as mask_file:
        mask = np.array(mask_file)

    # Ensure masks are 2D for simplicity
    if mask.ndim > 2 and mask.shape[-1] == 1:
        mask = mask[:, :, 0]

    # Split the image and mask into patches on the same grid
    image_patches = patchify(image, (patch_size, patch_size, 3), step=patch_size)
    mask_patches = patchify(mask, (patch_size, patch_size), step=patch_size)

    print(f"Image patches shape: {image_patches.shape}")
    print(f"Mask patches shape: {mask_patches.shape}")

    return image_patches, mask_patches

def filter_patches(image_patches, mask_patches):
    """Keep only the patch pairs whose mask contains more than one value.

    A mask patch made of a single value (all background or all foreground)
    is dropped together with its image patch.

    Args:
        image_patches: Array indexed as ``[row, col, 0, y, x, channel]``.
        mask_patches: Array indexed as ``[row, col, y, x]`` on the same grid.

    Returns:
        Tuple ``(images, masks)`` of stacked arrays with shapes
        ``(n, height, width, channels)`` and ``(n, height, width)``, in
        row-major grid order. When no patch qualifies, both arrays have
        ``n == 0`` but keep the trailing patch dimensions and the input
        dtypes, so they can still be concatenated with results from other
        images.
    """
    kept_images = []
    kept_masks = []

    rows, cols = image_patches.shape[0], image_patches.shape[1]
    for row in range(rows):
        for col in range(cols):
            # Index 0 removes the singleton axis patchify adds for the image.
            patch_image = image_patches[row, col, 0]
            patch_mask = mask_patches[row, col]

            # More than one distinct value means the patch is not uniform.
            if np.unique(patch_mask).size > 1:
                kept_images.append(patch_image)
                kept_masks.append(patch_mask)

    if not kept_images:
        # np.array([]) would yield a 1-D float array of shape (0,), which
        # cannot be concatenated with the 4-D / 3-D results of other images.
        empty_images = np.empty((0,) + tuple(image_patches.shape[3:]), dtype=image_patches.dtype)
        empty_masks = np.empty((0,) + tuple(mask_patches.shape[2:]), dtype=mask_patches.dtype)
        return empty_images, empty_masks

    return np.array(kept_images), np.array(kept_masks)

def process_and_save_images(image_paths, mask_paths, patch_size=256):
    """Tile every image/mask pair and pool the patches worth keeping.

    Despite the name, nothing is written to disk: the patches are returned
    as in-memory arrays.

    Args:
        image_paths: Paths of the images, in the same order as ``mask_paths``.
        mask_paths: Paths of the masks; entry ``k`` belongs to image ``k``.
        patch_size: Edge length of the square patches, forwarded to
            ``split_image_and_mask``.

    Returns:
        Tuple ``(filtered_images, filtered_masks)`` holding the patches of
        all inputs concatenated along the first axis.

    Raises:
        ValueError: If ``image_paths`` and ``mask_paths`` differ in length.
            ``np.concatenate`` also raises ``ValueError`` when no paths are
            given at all.
    """
    image_paths = list(image_paths)
    mask_paths = list(mask_paths)
    # zip() would silently drop the unmatched tail, so fail loudly instead.
    if len(image_paths) != len(mask_paths):
        raise ValueError(
            f"Got {len(image_paths)} image paths but {len(mask_paths)} mask paths; "
            "every image needs exactly one mask."
        )

    all_images = []
    all_masks = []

    for img_path, mask_path in zip(image_paths, mask_paths):
        image_patches, mask_patches = split_image_and_mask(img_path, mask_path, patch_size)

        # Filter out patches whose mask holds a single value
        images, masks = filter_patches(image_patches, mask_patches)
        all_images.append(images)
        all_masks.append(masks)

    # Concatenate all valid image and mask patches
    all_images = np.concatenate(all_images)
    all_masks = np.concatenate(all_masks)

    # Second safety net: drop any patch whose mask has no non-zero pixel.
    valid_indices = [i for i, mask in enumerate(all_masks) if mask.max() != 0]
    filtered_images = all_images[valid_indices]
    filtered_masks = all_masks[valid_indices]

    return filtered_images, filtered_masks

# Build the patch arrays for each split with the default 256-pixel patch size.
train_images, train_masks = process_and_save_images(
    image_paths=image_paths_train, mask_paths=label_paths_train
)
val_images, val_masks = process_and_save_images(
    image_paths=image_paths_val, mask_paths=label_paths_val
)
test_images, test_masks = process_and_save_images(
    image_paths=image_paths_test, mask_paths=label_paths_test
)



Image patches shape: (38, 48, 1, 256, 256, 3)
Mask patches shape: (38, 48, 256, 256)




Image patches shape: (39, 49, 1, 256, 256, 3)
Mask patches shape: (39, 49, 256, 256)




Image patches shape: (36, 48, 1, 256, 256, 3)
Mask patches shape: (36, 48, 256, 256)




Image patches shape: (37, 50, 1, 256, 256, 3)
Mask patches shape: (37, 50, 256, 256)




Image patches shape: (37, 50, 1, 256, 256, 3)
Mask patches shape: (37, 50, 256, 256)




Image patches shape: (37, 49, 1, 256, 256, 3)
Mask patches shape: (37, 49, 256, 256)




Image patches shape: (38, 48, 1, 256, 256, 3)
Mask patches shape: (38, 48, 256, 256)


Function to turn the lists of image and mask arrays into a training, validation, and test dataset.

In [8]:
def create_dataset(image_arrays, mask_arrays):
    """Build a Hugging Face ``Dataset`` with ``image`` and ``label`` columns.

    Args:
        image_arrays: Iterable of image patches as NumPy arrays.
        mask_arrays: Iterable of mask patches as NumPy arrays, paired with
            ``image_arrays`` by position.

    Returns:
        A ``Dataset`` whose ``image`` and ``label`` columns are both cast to
        the Hugging Face ``Image`` feature.
    """
    def _via_pil(array):
        # NumPy -> PIL -> NumPy round trip, kept from the original pipeline.
        return np.array(PILImage.fromarray(array))

    columns = {"image": [], "label": []}
    for image_array, mask_array in zip(image_arrays, mask_arrays):
        columns["image"].append(_via_pil(image_array))
        columns["label"].append(_via_pil(mask_array))

    dataset = Dataset.from_dict(columns)
    # Both columns are stored as images rather than as nested number lists.
    for column_name in ("image", "label"):
        dataset = dataset.cast_column(column_name, HFImage())

    return dataset

In [9]:
# The binary_*_masks variables were used here without being defined anywhere
# in the notebook (the cell that created them is missing), so a fresh
# Restart & Run All failed with NameError. They are rebuilt here.
# NOTE(review): assumes every non-zero mask pixel is foreground and that labels
# should be stored as 0/1 uint8 - confirm against the original binarisation.
binary_train_masks = (train_masks > 0).astype(np.uint8)
binary_val_masks = (val_masks > 0).astype(np.uint8)
binary_test_masks = (test_masks > 0).astype(np.uint8)

train_dataset = create_dataset(train_images, binary_train_masks)
val_dataset = create_dataset(val_images, binary_val_masks)
test_dataset = create_dataset(test_images, binary_test_masks)



Compile your train, validation, and test dataset into a single huggingface formatted dataset. 

In [12]:
# One DatasetDict keyed by split name, ready to be pushed to the Hub.
dataset = DatasetDict(
    train=train_dataset,
    validation=val_dataset,
    test=test_dataset,
)

Upload your dataset to huggingface

In [14]:
# Replace the placeholder with your own "<username>/<dataset name>" repo id.
dataset.push_to_hub(repo_id='yourusername/yourdataset')

Uploading the dataset shards:   0%|          | 0/6 [00:00<?, ?it/s]
Map:   0%|          | 0/1067 [00:00<?, ? examples/s][A
Map: 100%|██████████| 1067/1067 [00:00<00:00, 1351.48 examples/s][A

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s][A
Creating parquet from Arrow format:  18%|█▊        | 2/11 [00:00<00:00, 12.36ba/s][A
Creating parquet from Arrow format:  36%|███▋      | 4/11 [00:00<00:00, 11.26ba/s][A
Creating parquet from Arrow format:  55%|█████▍    | 6/11 [00:00<00:00, 11.06ba/s][A
Creating parquet from Arrow format:  73%|███████▎  | 8/11 [00:00<00:00, 11.97ba/s][A
Creating parquet from Arrow format: 100%|██████████| 11/11 [00:00<00:00, 12.22ba/s][A
Uploading the dataset shards:  17%|█▋        | 1/6 [00:11<00:55, 11.14s/it]
Map:   0%|          | 0/1067 [00:00<?, ? examples/s][A
Map: 100%|██████████| 1067/1067 [00:00<00:00, 1459.31 examples/s][A

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s][A
Creating parquet 

CommitInfo(commit_url='https://huggingface.co/datasets/saking3/alaska_dead_trees/commit/1d42386b17ab1e911c537f48ad7960cc16179734', commit_message='Upload dataset', commit_description='', oid='1d42386b17ab1e911c537f48ad7960cc16179734', pr_url=None, pr_revision=None, pr_num=None)

Test that your dataset can be downloaded, and view what information it contains.

In [15]:
# Download the published dataset back from the Hub to check the upload.
dataset = load_dataset(path='saking3/alaska_dead_trees')

Downloading readme: 100%|██████████| 1.40k/1.40k [00:00<00:00, 13.3MB/s]
Downloading data: 100%|██████████| 156M/156M [00:03<00:00, 40.5MB/s] 
Downloading data: 100%|██████████| 152M/152M [00:04<00:00, 36.6MB/s] 
Downloading data: 100%|██████████| 158M/158M [00:03<00:00, 41.7MB/s] 
Downloading data: 100%|██████████| 161M/161M [00:04<00:00, 37.7MB/s] 
Downloading data: 100%|██████████| 154M/154M [00:03<00:00, 43.0MB/s] 
Downloading data: 100%|██████████| 153M/153M [00:03<00:00, 43.5MB/s] 
Downloading data: 100%|██████████| 90.3M/90.3M [00:02<00:00, 35.0MB/s]
Downloading data: 100%|██████████| 177M/177M [00:04<00:00, 36.4MB/s] 
Generating train split: 100%|██████████| 6400/6400 [00:05<00:00, 1188.70 examples/s]
Generating validation split: 100%|██████████| 653/653 [00:00<00:00, 1252.38 examples/s]
Generating test split: 100%|██████████| 1286/1286 [00:01<00:00, 1229.63 examples/s]


In [16]:
# Display the splits with their feature names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 6400
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 653
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 1286
    })
})