In [8]:
import numpy as np
import os

def process_masks_to_npy(input_path, output_path, chunk_size=1000):
    """
    Process masks to binary format in smaller chunks and save in standard .npy format.
    
    Parameters:
    - input_path: Path to the input mask file
    - output_path: Path to save the processed mask file
    - chunk_size: Number of slices to process at a time
    """
    try:
        # Load input masks in memory-mapped mode
        masks = np.load(input_path, mmap_mode="r")
        num_slices = masks.shape[0]
        print(f"Loaded masks: {masks.shape}")

        # Process in chunks
        binary_masks = np.zeros_like(masks, dtype="int8")  # Prepare an output array
        for start_idx in range(0, num_slices, chunk_size):
            end_idx = min(start_idx + chunk_size, num_slices)
            chunk = masks[start_idx:end_idx]  # Load a chunk
            binary_masks[start_idx:end_idx] = np.where(chunk > 0, 1, 0)  # Convert to binary
            print(f"Processed slices {start_idx} to {end_idx - 1}")

        # Save the processed binary masks as a standard .npy file
        np.save(output_path, binary_masks)
        print(f"Binary masks saved successfully to {output_path}")

        # Validate the saved file
        cleaned_masks = np.load(output_path)
        print(f"Validation - Shape of cleaned masks: {cleaned_masks.shape}")
        print(f"Validation - Unique values in cleaned masks: {np.unique(cleaned_masks)}")
    
    except Exception as e:
        print(f"Error during processing: {e}")

# Define paths
mask_path = "../data/training_data/val/Residential_val_masks_combined.npy"
output_path = "../data/training_data/val/Residential_val_masks_cleaned.npy"

# Process the masks
process_masks_to_npy(mask_path, output_path)


Loaded masks: (123360, 257, 21)
Processed slices 0 to 999
Processed slices 1000 to 1999
Processed slices 2000 to 2999
Processed slices 3000 to 3999
Processed slices 4000 to 4999
Processed slices 5000 to 5999
Processed slices 6000 to 6999
Processed slices 7000 to 7999
Processed slices 8000 to 8999
Processed slices 9000 to 9999
Processed slices 10000 to 10999
Processed slices 11000 to 11999
Processed slices 12000 to 12999
Processed slices 13000 to 13999
Processed slices 14000 to 14999
Processed slices 15000 to 15999
Processed slices 16000 to 16999
Processed slices 17000 to 17999
Processed slices 18000 to 18999
Processed slices 19000 to 19999
Processed slices 20000 to 20999
Processed slices 21000 to 21999
Processed slices 22000 to 22999
Processed slices 23000 to 23999
Processed slices 24000 to 24999
Processed slices 25000 to 25999
Processed slices 26000 to 26999
Processed slices 27000 to 27999
Processed slices 28000 to 28999
Processed slices 29000 to 29999
Processed slices 30000 to 30999


In [9]:
import numpy as np
import cv2

# Paths
input_cleaned_masks = "../data/training_data/train/AnnualCrop_train_masks_cleaned.npy"
output_resized_masks = "../data/training_data/train/AnnualCrop_train_masks_cleaned_resized.npy"

# Load cleaned masks in memory-mapped mode
cleaned_masks = np.load(input_cleaned_masks, mmap_mode="r")

# Open a memory-mapped file for the output
num_samples = cleaned_masks.shape[0]
output_shape = (num_samples, 512, 512)
resized_masks = np.memmap(output_resized_masks, dtype="int8", mode="w+", shape=output_shape)

chunk_size = 1000  # Adjust chunk size based on memory availability

# Process masks in chunks
for start_idx in range(0, num_samples, chunk_size):
    end_idx = min(start_idx + chunk_size, num_samples)
    chunk = cleaned_masks[start_idx:end_idx]
    
    # Resize each mask in the chunk
    for i, mask in enumerate(chunk):
        resized_masks[start_idx + i] = cv2.resize(mask, (512, 512), interpolation=cv2.INTER_NEAREST)
    
    print(f"Processed and saved masks from index {start_idx} to {end_idx - 1}")

# Ensure changes are written to disk
resized_masks.flush()
print(f"Resized masks saved to {output_resized_masks}")


Processed and saved masks from index 0 to 999
Processed and saved masks from index 1000 to 1999
Processed and saved masks from index 2000 to 2999
Processed and saved masks from index 3000 to 3999
Processed and saved masks from index 4000 to 4999
Processed and saved masks from index 5000 to 5999
Processed and saved masks from index 6000 to 6999
Processed and saved masks from index 7000 to 7999
Processed and saved masks from index 8000 to 8999
Processed and saved masks from index 9000 to 9999
Processed and saved masks from index 10000 to 10999
Processed and saved masks from index 11000 to 11999
Processed and saved masks from index 12000 to 12999
Processed and saved masks from index 13000 to 13999
Processed and saved masks from index 14000 to 14999
Processed and saved masks from index 15000 to 15999
Processed and saved masks from index 16000 to 16999
Processed and saved masks from index 17000 to 17999
Processed and saved masks from index 18000 to 18999
Processed and saved masks from index

In [10]:
resized_masks = np.load(output_resized_masks, mmap_mode="r")
print(f"Resized masks shape: {resized_masks.shape}")


ValueError: Cannot load file containing pickled data when allow_pickle=False