In [None]:
import rasterio
import numpy as np

# Collapse the raster's land-cover codes into a handful of broad classes.
# NOTE(review): the keys look like ESA WorldCover class codes - confirm against
# the product legend. Codes that are absent here are never remapped and therefore
# stay -1 in label_array.
simplified_mapping = {
    10: 'Vegetation',
    20: 'Vegetation',
    30: 'Vegetation',
    40: 'Cropland',
    50: 'Built-up',
    80: 'Water',
    90: 'Wetland'
}

# Give every distinct class name an integer id, numbered in alphabetical order
class_names = sorted(set(simplified_mapping.values()))
label_map = dict(zip(class_names, range(len(class_names))))
print("Label Map:", label_map)

# Raster code -> integer class id
reverse_map = {}
for raster_code, class_name in simplified_mapping.items():
    reverse_map[raster_code] = label_map[class_name]

# Read band 1 of the aligned WorldCover raster; int16 leaves room for the -1 sentinel
with rasterio.open('../data/labels/worldcover_aligned.tif') as src:
    profile = src.profile
    wc_raw = src.read(1).astype(np.int16)

# Treat 255 as nodata before mapping.
# NOTE(review): assumes 255 is this raster's nodata value - confirm against src.nodata
np.putmask(wc_raw, wc_raw == 255, -1)

# Start with every pixel at -1 so that unwanted classes remain masked out
label_array = np.full(wc_raw.shape, -1, dtype=wc_raw.dtype)

# Write the class id wherever the raster holds one of the mapped codes
for raster_code, class_id in reverse_map.items():
    np.putmask(label_array, wc_raw == raster_code, class_id)

# Optional sanity check: number of pixels per class id (masked pixels excluded)
unique, counts = np.unique(label_array[label_array >= 0], return_counts=True)
print("Class counts:", {class_id: n_pixels for class_id, n_pixels in zip(unique, counts)})


In [None]:
import random
import pandas as pd

# Build a per-class training set by sampling labelled pixels from the composite.
# Inputs : `label_array` (H, W) and `label_map`, both produced by the label cell above.
# Outputs: X (n_samples, n_bands) float32 and y (n_samples,) int64, saved to ../data/interim/.

# Make sure composite is loaded: shape = (bands, height, width)
with rasterio.open('../data/raw/barishal_composite_2022.tif') as src:
    composite_array = src.read().astype(np.float32)  # shape: (bands, H, W)

# Row/col indices found in label_array are used to index the composite, so the two
# grids must match exactly; otherwise samples would be silently mislabeled or the
# lookup would fail with an opaque IndexError.
if composite_array.shape[1:] != label_array.shape:
    raise ValueError(
        f"Composite grid {composite_array.shape[1:]} does not match "
        f"label grid {label_array.shape}"
    )

# Parameters
samples_per_class = 5000
n_bands = composite_array.shape[0]
SEED = 42  # fixed seed so Restart & Run All reproduces the same X.npy / y.npy
rng = np.random.default_rng(SEED)
features = []
labels = []

# Loop over every class id defined in label_map (0 .. n_classes - 1)
for cls in sorted(label_map.values()):
    # Keep the pixel coordinates as index arrays; materialising a Python list of
    # (row, col) tuples for every labelled pixel is needlessly slow and memory hungry.
    rows, cols = np.nonzero(label_array == cls)
    n_available = rows.size

    # Draw without replacement, or take everything when the class is too small
    if n_available >= samples_per_class:
        picked = rng.choice(n_available, size=samples_per_class, replace=False)
    else:
        print(f"⚠️ Warning: only {n_available} pixels for class {cls}. Sampling all.")
        picked = np.arange(n_available)

    # Extract the band values of all picked pixels at once -> (n_picked, n_bands)
    pixels = composite_array[:, rows[picked], cols[picked]].T

    # Skip pixels with a NaN in any band. This happens after sampling, so a class
    # can end up with fewer than samples_per_class rows.
    # NOTE(review): only NaN is treated as missing; if the composite encodes nodata
    # as a numeric value it is not filtered here - confirm.
    valid = ~np.isnan(pixels).any(axis=1)
    features.extend(pixels[valid])
    labels.extend([cls] * int(valid.sum()))

# Convert to arrays; the explicit reshape/dtype keep X two-dimensional and y integer
# even when no sample survived.
X = np.array(features, dtype=np.float32).reshape(-1, n_bands)
y = np.array(labels, dtype=np.int64)

print("✅ Training samples shape:", X.shape)
print("✅ Labels shape:", y.shape)

np.save('../data/interim/X.npy', X)
np.save('../data/interim/y.npy', y)