## Prepare the Dataset for Processing

### Import Libraries

In [3]:
import os
import h5py
import numpy as np
from tqdm import tqdm

### Define Directories

In [4]:
# Root folder that holds the image sub-directories.
base_dir = "../dataset/dataset_ca_17"
# Sub-directories of base_dir that are scanned for images.
sensor_dirs = [
    "GNDVI",
    "NDVI",
    "NDVI45",
    "OSAVI",
    "PSRI",
    "RGB",
]
# Output archive: same location and name as base_dir, with an ".hdf5" suffix.
hdf5_path = f"{base_dir}.hdf5"

### Define Mapping for Crop Types

In [5]:
# Integer class label for each crop type. Labels are assigned by position in
# the (alphabetically ordered) tuple below, starting at 0.
crop_mapping = {
    crop_name: class_id
    for class_id, crop_name in enumerate(
        (
            "BARLEY",
            "CANOLA",
            "CORN",
            "MIXEDWOOD",
            "OAT",
            "ORCHARD",
            "PASTURE",
            "POTATO",
            "SOYBEAN",
            "SPRING_WHEAT",
        )
    )
}

### Parse Filenames

In [6]:
def parse_filename(filename, mapping=None):
    """Parse an image file name into its metadata fields.

    Expected format::

        POINT_<point>_<date>_<region>_<crop_type>_<sensor_type>.png

    ``<crop_type>`` may itself contain underscores (e.g. ``SPRING_WHEAT``):
    every token between the region and the final sensor token is joined back
    together to form the crop type.

    Args:
        filename: Base name of the file (extension included).
        mapping: Optional dict from crop-type name to integer label.
            Defaults to the module-level ``crop_mapping``.

    Returns:
        A dict with the keys ``POINT`` (int), ``DATE``, ``REGION``,
        ``CROP_TYPE``, ``SENSOR_TYPE`` (str) and ``LABEL`` (int, ``-1`` when
        the crop type is not in the mapping), or ``None`` when the name does
        not follow the expected pattern.
    """
    # Remove extension and split by underscore
    stem, _ = os.path.splitext(filename)
    parts = stem.split('_')

    # Check if the filename meets the expected pattern
    if len(parts) < 6 or parts[0] != "POINT":
        return None

    # A non-numeric point token means the name is malformed; report it the
    # same way as any other mismatch instead of raising ValueError.
    try:
        point = int(parts[1])
    except ValueError:
        return None

    if mapping is None:
        mapping = crop_mapping

    crop_type = "_".join(parts[4:-1])

    return {
        "POINT": point,
        "DATE": parts[2],
        "REGION": parts[3],
        "CROP_TYPE": crop_type,
        "SENSOR_TYPE": parts[-1],
        # Unknown crop types are labelled -1
        "LABEL": mapping.get(crop_type, -1),
    }

### Collect Dataset from Subdirectories

In [7]:
# Group records by identifier: (POINT, DATE, REGION, CROP_TYPE).
# Every group holds the metadata plus one raw-bytes slot per sensor, which
# stays None until the matching image file is found.
sensor_columns = ("RGB", "NDVI", "NDVI45", "OSAVI", "PSRI", "GNDVI")
unknown_crop_types = set()
groups = {}
for sensor_dir in sensor_dirs:
    sensor_path = os.path.join(base_dir, sensor_dir)
    if not os.path.isdir(sensor_path):
        continue
    for root, dirs, files in os.walk(sensor_path):
        # Sort in place so the traversal order, and therefore the order of
        # the collected groups, does not depend on the file system.
        dirs.sort()
        for file in sorted(files):
            # Verify file type and metadata
            if not file.lower().endswith('.png'):
                continue
            if not file.startswith("POINT_"):
                continue
            meta = parse_filename(file)
            if meta is None:
                continue

            file_path = os.path.join(root, file)

            # Determine the sensor type (force uppercase for consistency).
            # Validate it against the sensor columns only: checking membership
            # in the record dict would also accept metadata keys such as
            # "LABEL" and overwrite them with image bytes.
            sensor_type = meta["SENSOR_TYPE"].upper()
            if sensor_type not in sensor_columns:
                print(f"Warning: Unrecognized sensor type '{sensor_type}' in file {file_path}")
                continue

            # Read the image file as raw bytes
            try:
                with open(file_path, 'rb') as f:
                    image_bytes = f.read()
            except OSError as e:
                print(f"Error reading {file_path}: {e}")
                continue

            # Use the common identifier tuple (POINT, DATE, REGION, CROP_TYPE)
            identifier = (meta["POINT"], meta["DATE"], meta["REGION"], meta["CROP_TYPE"])

            # Create group entry if not already present; initialize sensor columns to None
            record = groups.get(identifier)
            if record is None:
                # Report each unmapped crop type once; its records carry LABEL == -1.
                if meta["LABEL"] == -1 and meta["CROP_TYPE"] not in unknown_crop_types:
                    unknown_crop_types.add(meta["CROP_TYPE"])
                    print(f"Warning: Crop type '{meta['CROP_TYPE']}' is not in crop_mapping (label -1)")
                record = {
                    "POINT": meta["POINT"],
                    "DATE": meta["DATE"],
                    "REGION": meta["REGION"],
                    "CROP_TYPE": meta["CROP_TYPE"],
                    "LABEL": meta["LABEL"],
                    **dict.fromkeys(sensor_columns),
                }
                groups[identifier] = record

            record[sensor_type] = image_bytes

print(f"Total groups collected: {len(groups)}")

Total groups collected: 14111


In [8]:
# Flatten the grouped mapping into a list holding one record dict per identifier.
records = [*groups.values()]
print(f"Total records: {len(records)}")

Total records: 14111


### Write Dataset to an HDF5 File

In [9]:
# Pull each metadata field out of the records into its own column list
# (same order as `records`).
points, dates, regions, crop_types, labels = (
    [rec[field] for rec in records]
    for field in ("POINT", "DATE", "REGION", "CROP_TYPE", "LABEL")
)

def convert_image_bytes(img_bytes):
    """Return raw image bytes as a 1-D uint8 NumPy array.

    A missing image (``None``) is represented by an empty uint8 array.
    """
    if img_bytes is not None:
        return np.frombuffer(img_bytes, dtype=np.uint8)
    return np.empty(0, dtype=np.uint8)

# Convert sensor images: one uint8 array per record (empty when the image is missing)
rgb_images    = [convert_image_bytes(rec["RGB"])    for rec in records]
ndvi_images   = [convert_image_bytes(rec["NDVI"])   for rec in records]
ndvi45_images = [convert_image_bytes(rec["NDVI45"]) for rec in records]
osavi_images  = [convert_image_bytes(rec["OSAVI"])  for rec in records]
psri_images   = [convert_image_bytes(rec["PSRI"])   for rec in records]
gndvi_images  = [convert_image_bytes(rec["GNDVI"])  for rec in records]

# HDF5 dataset name -> per-record image list, so the datasets can be created
# and filled in one loop instead of six copy-pasted statements.
sensor_images = {
    "RGB": rgb_images,
    "NDVI": ndvi_images,
    "NDVI45": ndvi45_images,
    "OSAVI": osavi_images,
    "PSRI": psri_images,
    "GNDVI": gndvi_images,
}

with h5py.File(hdf5_path, "w") as hf:
    # Create datasets for metadata
    hf.create_dataset("POINT", data=np.array(points, dtype=np.int64))
    hf.create_dataset("DATE", data=np.array(dates, dtype=h5py.string_dtype(encoding='utf-8')))
    hf.create_dataset("REGION", data=np.array(regions, dtype=h5py.string_dtype(encoding='utf-8')))
    hf.create_dataset("CROP_TYPE", data=np.array(crop_types, dtype=h5py.string_dtype(encoding='utf-8')))
    hf.create_dataset("LABEL", data=np.array(labels, dtype=np.int64))

    # Create variable-length datasets for sensor images using uint8 arrays.
    # vlen_dtype is the current replacement for the legacy special_dtype(vlen=...).
    vlen_uint8 = h5py.vlen_dtype(np.dtype('uint8'))
    # Keep the dataset handles so the write loop does not look each dataset
    # up by name again on every row.
    sensor_datasets = {
        name: hf.create_dataset(name, (len(images),), dtype=vlen_uint8)
        for name, images in sensor_images.items()
    }

    # Write sensor image data row by row
    for i in tqdm(range(len(records)), desc="Saving sensor images"):
        for name, dataset in sensor_datasets.items():
            dataset[i] = sensor_images[name][i]

print(f"Data saved to {hdf5_path}")

Saving sensor images: 100%|██████████| 14111/14111 [01:01<00:00, 230.43it/s]


Data saved to ../dataset/dataset_ca_17.hdf5


### Verify HDF5 Data

In [11]:
# Re-open the file read-only and list every top-level entry with its shape
with h5py.File(hdf5_path, "r") as hf:
    print("Datasets in the file:")
    for name, dataset in hf.items():
        print(f"{name}: {dataset.shape}")

Datasets in the file:
CROP_TYPE: (14111,)
DATE: (14111,)
GNDVI: (14111,)
LABEL: (14111,)
NDVI: (14111,)
NDVI45: (14111,)
OSAVI: (14111,)
POINT: (14111,)
PSRI: (14111,)
REGION: (14111,)
RGB: (14111,)
