## Crop Mapping with Deep Learning

### Import Libraries

In [None]:
import io
import h5py
import random
import numpy as np
import tensorflow as tf
from PIL import Image, ImageEnhance
from collections import defaultdict

from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from imblearn.over_sampling import RandomOverSampler

### Parameter Configuration

In [None]:
# --- Dataset ---------------------------------------------------------------
# Location of the HDF5 archive, target image size, number of consecutive
# records per sequence, and the sensor datasets read for every record.
hdf5_path = "../dataset/dataset_ca_17.hdf5"
image_size = (65, 65)
sequence_length = 3
sensors = ["RGB", "NDVI45", "PSRI"]

# Crop name -> integer class id; ids follow the (alphabetical) listing order.
crop_mapping = {
    crop_name: class_id
    for class_id, crop_name in enumerate((
        "BARLEY",
        "CANOLA",
        "CORN",
        "MIXEDWOOD",
        "OAT",
        "ORCHARD",
        "PASTURE",
        "POTATO",
        "SOYBEAN",
        "SPRING_WHEAT",
    ))
}

# --- Split -----------------------------------------------------------------
# Fractions held out for the test and validation subsets.
test_size = 0.3
val_size = 0.15

# --- Augmentation / batching -----------------------------------------------
# Batch size plus the probability of each random augmentation; rotate_range
# is the (min, max) rotation angle passed to random.randint.
batch_size = 32
color_prob = 0.7
hflip_prob = 0.5
rotate_prob = 0.3
rotate_range = (-15, 15)

### Load and Verify Dataset

In [None]:
# Decode one encoded image held in a variable-length uint8 array.
def decode_image(uint8_array, sensor_type):
    """Decode an encoded image buffer into a PIL image.

    Args:
        uint8_array: Array exposing ``.size`` and ``.tobytes()`` whose bytes
            are the encoded image file.
        sensor_type: Sensor name. ``"RGB"`` is converted to mode "RGB";
            every other sensor is converted to single-channel mode "L".

    Returns:
        The converted image, or ``None`` when the buffer is empty or when
        opening/converting the image raises.
    """
    if uint8_array.size == 0:
        return None

    raw = uint8_array.tobytes()
    target_mode = "RGB" if sensor_type == "RGB" else "L"

    try:
        return Image.open(io.BytesIO(raw)).convert(target_mode)
    except Exception as e:
        # Best effort: report the failure and let the caller see a missing image.
        print(f"Error decoding {sensor_type} image:", e)
        return None

In [None]:
# Lazily yield one record per row of the HDF5 file.
def load_records(hdf5_path):
    """Yield a dict for every row of the HDF5 file at ``hdf5_path``.

    Each dict holds the raw "DATE", "POINT", "REGION" and "LABEL" values of
    the row, plus one entry per name in the module-level ``sensors`` list
    containing the result of ``decode_image`` for that sensor's data.

    The file stays open while the generator is being consumed; the row count
    is taken from the first dimension of the "POINT" dataset.
    """
    metadata_keys = ("DATE", "POINT", "REGION", "LABEL")

    with h5py.File(hdf5_path, "r") as hf:
        for row in range(hf["POINT"].shape[0]):
            record = {key: hf[key][row] for key in metadata_keys}
            record.update(
                (sensor, decode_image(hf[sensor][row], sensor))
                for sensor in sensors
            )
            yield record


# Load every record into memory. load_records itself is lazy, but list()
# consumes the whole generator, so all decoded images are held at once.
records = list(load_records(hdf5_path))

# Verification: print the label and per-sensor image size of one random record
selected_record = random.choice(records)
print("Selected Record Details:")
print(f"LABEL: {selected_record['LABEL']}")
for sensor in sensors:
    image = selected_record[sensor]
    # decode_image returns None for empty or undecodable data -> 'Missing'
    print(f"{sensor}: {image.size if image else 'Missing'}")

### Create Sequences

In [None]:
# Create temporal sequences grouped by geographic points
def create_sequences(records, window=None):
    """Build sliding-window sequences of records for each geographic point.

    Records are grouped by their "POINT" value. Points whose records do not
    all share one "LABEL" are skipped. The remaining groups are ordered by
    "DATE" and every run of ``window`` consecutive records becomes one
    sequence labelled with the point's label.

    Args:
        records: Iterable of dicts with "POINT", "DATE" and "LABEL" keys.
            "POINT" values must be hashable and "DATE" values sortable.
        window: Number of records per sequence. Defaults to the module-level
            ``sequence_length`` when omitted.

    Returns:
        ``(sequences, labels)``: a list of record lists and the parallel list
        of labels.

    Raises:
        ValueError: If the window length is smaller than 1.
    """
    if window is None:
        window = sequence_length
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")

    points = defaultdict(list)
    for record in records:
        points[record["POINT"]].append(record)

    sequences, labels = [], []
    for point_recs in points.values():
        # A point with conflicting labels has no single ground truth.
        if len({r["LABEL"] for r in point_recs}) != 1:
            continue

        ordered = sorted(point_recs, key=lambda r: r["DATE"])
        label = ordered[0]["LABEL"]

        # Points with fewer than `window` records contribute nothing.
        n_windows = max(len(ordered) - window + 1, 0)
        sequences.extend(
            ordered[start:start + window] for start in range(n_windows))
        labels.extend([label] * n_windows)

    print(f"Created {len(sequences)} sequences from {len(points)} points")
    return sequences, labels


# Build the per-point sliding-window sequences and their labels from all records
sequences, labels = create_sequences(records)

### Dataset Splitting

In [None]:
# Train/validation/test split with balanced oversampling.
def split_dataset(sequences, labels):
    """Split sequences by geographic point and oversample the training part.

    The unique "POINT" values (taken from the first record of each sequence)
    are split into train/validation/test groups using the module-level
    ``test_size`` and ``val_size`` fractions, so every sequence of a point
    lands in exactly one subset. Only the training subset is oversampled.

    Args:
        sequences: List of record sequences; each first record needs a
            hashable "POINT" value.
        labels: Labels parallel to ``sequences``.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)`` as lists. The
        oversampled training lists are not shuffled.
    """
    points = [seq[0]["POINT"] for seq in sequences]
    # De-duplicate in first-seen order: iterating a set can yield a different
    # order on each run, which would defeat the fixed random_state below.
    unique_points = list(dict.fromkeys(points))

    # Initial split; val_size is rescaled because it is applied to what is
    # left after the test points were removed.
    train_points, test_points = train_test_split(
        unique_points, test_size=test_size, random_state=42)
    train_points, val_points = train_test_split(
        train_points, test_size=val_size / (1 - test_size), random_state=42)

    # Sets give constant-time membership tests for the index scans below.
    train_set, val_set, test_set = (
        set(train_points), set(val_points), set(test_points))
    train_indices = [i for i, p in enumerate(points) if p in train_set]
    val_indices = [i for i, p in enumerate(points) if p in val_set]
    test_indices = [i for i, p in enumerate(points) if p in test_set]

    # Oversample training data. 'not majority' raises every class, including
    # the rarest one, to the majority count; 'not minority' would leave the
    # rarest class untouched. Sequence indices are resampled instead of the
    # sequences themselves so the records are not copied.
    ros = RandomOverSampler(sampling_strategy='not majority', random_state=42)
    resampled_indices, _ = ros.fit_resample(
        np.array(train_indices).reshape(-1, 1),
        [labels[i] for i in train_indices])
    resampled_indices = resampled_indices.flatten()

    return (
        [sequences[i] for i in resampled_indices],
        [labels[i] for i in resampled_indices],
        [sequences[i] for i in val_indices],
        [labels[i] for i in val_indices],
        [sequences[i] for i in test_indices],
        [labels[i] for i in test_indices],
    )


# Split the sequences by geographic point; the training part is oversampled
X_train, y_train, X_val, y_val, X_test, y_test = split_dataset(
    sequences, labels)

### Data Augmentation

In [None]:
# Randomly perturb an image; the color change is applied to RGB inputs only.
def augment_image(image, sensor):
    """Return ``image`` after a random subset of augmentations.

    Each step fires independently, governed by the module-level
    probabilities ``color_prob``, ``hflip_prob`` and ``rotate_prob``:

    * color enhancement with a factor drawn uniformly from [0.8, 1.2]
      (only when ``sensor`` is "RGB"),
    * horizontal flip,
    * rotation by an integer angle drawn from ``rotate_range`` (inclusive).

    Args:
        image: PIL image to augment.
        sensor: Sensor name the image belongs to.

    Returns:
        The augmented image; the input itself when no step fires.
    """
    # The sensor test comes first so non-RGB inputs consume no random number.
    wants_color = sensor == "RGB" and random.random() < color_prob
    if wants_color:
        factor = random.uniform(0.8, 1.2)
        image = ImageEnhance.Color(image).enhance(factor)

    wants_flip = random.random() < hflip_prob
    if wants_flip:
        image = image.transpose(Image.FLIP_LEFT_RIGHT)

    wants_rotation = random.random() < rotate_prob
    if wants_rotation:
        angle = random.randint(*rotate_range)
        image = image.rotate(angle)

    return image

### Preprocessing

In [None]:
# Turn a sequence of records into a single normalized numpy array.
def preprocess_sequence(sequence, sensors, image_size, augment=False):
    """Stack the sensor images of every record into one array.

    For each record, every sensor image is copied, optionally passed through
    ``augment_image``, resized to ``image_size``, divided by 255.0 and given
    a trailing channel axis if it is 2-D. The per-sensor arrays are joined
    along the last axis, and the per-record results are stacked in order.

    Args:
        sequence: Iterable of records mapping sensor name -> PIL image.
        sensors: Sensor names to read from each record, in channel order.
        image_size: Size passed to ``Image.resize``.
        augment: When true, each image is augmented independently.

    Returns:
        ``np.ndarray`` with one leading entry per record.

    NOTE(review): a record whose sensor entry is None (which decode_image
    returns for empty or undecodable data) raises AttributeError on
    ``.copy()`` here.
    """
    def _sensor_planes(record, sensor):
        picture = record[sensor].copy()
        if augment:
            picture = augment_image(picture, sensor)
        pixels = np.array(picture.resize(image_size)) / 255.0
        if pixels.ndim == 2:
            # Single-channel image: add the channel axis needed to concatenate.
            pixels = pixels[..., np.newaxis]
        return pixels

    frames = [
        np.concatenate(
            [_sensor_planes(record, sensor) for sensor in sensors], axis=-1)
        for record in sequence
    ]
    return np.array(frames)

### TensorFlow Dataset Pipeline

In [None]:
def create_tf_dataset(sequences, labels, sensors, image_size, batch_size, augment=False):
    """Build a batched ``tf.data`` pipeline over record sequences.

    The sequences are lists of dicts holding PIL images, which TensorFlow
    cannot convert into tensors. The dataset therefore iterates over integer
    positions, and the matching sequence is looked up and preprocessed in
    Python inside ``tf.py_function``.

    Args:
        sequences: List of record sequences accepted by ``preprocess_sequence``.
        labels: Labels parallel to ``sequences``.
        sensors: Sensor names forwarded to ``preprocess_sequence``.
        image_size: Target image size forwarded to ``preprocess_sequence``.
        batch_size: Number of samples per batch.
        augment: Whether ``preprocess_sequence`` applies random augmentation.

    Returns:
        A ``tf.data.Dataset`` yielding ``(frames, label)`` batches, where
        ``frames`` is float32.
    """
    def _py_preprocess(index):
        # Runs eagerly: map the position back to the Python-side sequence.
        sequence = sequences[int(index.numpy())]
        return preprocess_sequence(sequence, sensors, image_size, augment)

    def _load(index, label):
        frames = tf.py_function(_py_preprocess, [index], tf.float32)
        # py_function drops static shape information; restore the rank and
        # spatial dims. Assumes the output is (time, height, width, channels)
        # and image_size is PIL-style (width, height).
        frames.set_shape([None, image_size[1], image_size[0], None])
        return frames, label

    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.range(len(sequences)), labels))
    dataset = dataset.map(_load, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset


# Create TensorFlow Datasets for the three subsets.
# Only the training dataset is built with augment=True; validation and test
# use the default (no augmentation).
train_ds = create_tf_dataset(
    X_train, y_train, sensors=sensors, image_size=image_size, augment=True, batch_size=batch_size)
val_ds = create_tf_dataset(
    X_val, y_val, sensors=sensors, image_size=image_size, batch_size=batch_size)
test_ds = create_tf_dataset(
    X_test, y_test, sensors=sensors, image_size=image_size, batch_size=batch_size)

### Verification

In [None]:
# Calculate class weights
# compute_class_weight returns one weight per entry of `classes`, in that
# order, so each weight is keyed by its actual class label. Keying by
# position would mislabel the weights whenever the training labels are not
# exactly 0..k-1 (for example when a class is absent from the training set).
# NOTE(review): y_train is the oversampled training set, so the 'balanced'
# weights reflect the post-oversampling class counts.
train_classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight('balanced',
                                                  classes=train_classes,
                                                  y=y_train)
class_weights = dict(zip(train_classes.tolist(), class_weights))

# Report subset sizes, the weights, and the shape of one training batch
print("Dataset Statistics:")
print(f"Train: {len(X_train)} samples")
print(f"Validation: {len(X_val)} samples")
print(f"Test: {len(X_test)} samples")
print("\nClass Weights:", class_weights)
print("\nSample Input Shape:", next(iter(train_ds))[0].shape)