## Crop Mapping with Deep Learning

### Import Libraries

In [None]:
import io
import h5py
import random
import numpy as np
import pandas as pd
from PIL import Image, ImageEnhance
from collections import defaultdict

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

In [None]:
# --- Pipeline configuration ---
# Location of the HDF5 dataset, relative to this notebook
hdf5_path = "../dataset/dataset_ca_17.hdf5"
# Size passed to PIL's resize() for every image
image_size = (224, 224)
# Number of consecutive records per point that form one sample
sequence_length = 3
# HDF5 dataset names holding the encoded images, in channel-stacking order
sensors = ["RGB", "NDVI45", "PSRI"]
# Crop name -> integer class id, assigned by position in the list below
crop_mapping = {
    crop_name: class_id
    for class_id, crop_name in enumerate(
        [
            "BARLEY",
            "CANOLA",
            "CORN",
            "MIXEDWOOD",
            "OAT",
            "ORCHARD",
            "PASTURE",
            "POTATO",
            "SOYBEAN",
            "SPRING_WHEAT",
        ]
    )
}

### Load and Verify Dataset

In [None]:
# Decode one encoded image held in a variable-length uint8 array
def decode_image(uint8_array, sensor_type):
    """Turn an encoded image buffer into a PIL image.

    Args:
        uint8_array: Array exposing ``.size`` and ``.tobytes()`` that holds
            the bytes of an encoded image file.
        sensor_type: Sensor name. ``"RGB"`` is decoded to mode ``"RGB"``;
            any other value is decoded to single-channel mode ``"L"``.

    Returns:
        The converted PIL image, or ``None`` when the buffer is empty or the
        bytes cannot be decoded (the error is printed, not raised).
    """
    # An empty buffer means there is nothing to decode for this record
    if uint8_array.size == 0:
        return None

    raw_bytes = uint8_array.tobytes()
    # Full colour for RGB; every other sensor is treated as one channel
    target_mode = "RGB" if sensor_type == "RGB" else "L"

    try:
        return Image.open(io.BytesIO(raw_bytes)).convert(target_mode)
    except Exception as e:
        # Best effort: report the failure and let the caller see None
        print(f"Error decoding {sensor_type} image:", e)
        return None

In [None]:
# One dict per HDF5 row: the metadata fields plus one decoded image per sensor
records = []
metadata_fields = ("DATE", "POINT", "REGION", "LABEL")

with h5py.File(hdf5_path, "r") as hf:
    # Show what the file contains so the expected datasets can be checked by eye
    print("Datasets available in the HDF5 file:")
    for name in hf.keys():
        print(f"{name}: {hf[name].shape}")

    # The length of the POINT dataset is taken as the number of rows
    num_records = hf["POINT"].shape[0]
    print("\nTotal number of records:", num_records)

    for row in range(num_records):
        # Metadata first, then the sensors, so the keys keep that order
        entry = {field: hf[field][row] for field in metadata_fields}
        # decode_image returns None for empty or undecodable buffers
        entry.update(
            (sensor, decode_image(hf[sensor][row], sensor)) for sensor in sensors
        )
        records.append(entry)

# Spot-check: dump one record picked at random
selected_record = random.choice(records)
print("\nDetails for a randomly selected record:")
for field in ("LABEL", "DATE", "POINT", "REGION"):
    print(f"{field}: {selected_record[field]}")
for sensor in sensors:
    image = selected_record[sensor]
    if image is None:
        print(f"{sensor}: No image data or decoding failed.")
        continue
    # PIL's .size is (width, height)
    print(f"{sensor}: decoded image shape = {image.size}, mode = {image.mode}")

### Group Records by Point

In [None]:
# Bucket the records by sampling point so each point's time series can be windowed
points_dict = defaultdict(list)
for record in records:
    points_dict[record['POINT']].append(record)

# Parallel lists: sequences[i] is a window of records, labels[i] its class,
# points_sequences[i] the point it was cut from
sequences = []
labels = []
points_sequences = []

for point, point_records in points_dict.items():
    # Chronological order, so that a window holds consecutive observations
    # NOTE(review): assumes DATE values sort chronologically as stored - confirm
    point_records_sorted = sorted(point_records, key=lambda x: x['DATE'])

    # Drop points whose records disagree on the label
    unique_labels = {r['LABEL'] for r in point_records_sorted}
    if len(unique_labels) != 1:
        continue

    label = point_records_sorted[0]['LABEL']

    # Slide a window of `sequence_length` records over the series. The range is
    # empty when the point has fewer records than one window needs.
    for start in range(len(point_records_sorted) - sequence_length + 1):
        window = point_records_sorted[start:start + sequence_length]

        # decode_image yields None for empty/undecodable buffers, and
        # process_sequence calls .copy() on every sensor image, so a window
        # containing a missing image would crash preprocessing. Skip it.
        if any(r[sensor] is None for r in window for sensor in sensors):
            continue

        sequences.append(window)
        labels.append(label)
        points_sequences.append(point)

### Split Train, Validation, and Test

In [None]:
# Initial point-based split.
# All sequences of a point share one label (points with mixed labels are
# dropped during grouping), so the first label seen for a point is its label.
# The dicts are filled in sequence order, which keeps the list of unique points
# in a stable order; set iteration order is not guaranteed to be stable between
# runs (e.g. str/bytes keys under hash randomisation), which would defeat the
# fixed random_state below.
point_labels = {}
point_to_indices = defaultdict(list)
for seq_idx, (seq_point, seq_label) in enumerate(zip(points_sequences, labels)):
    point_labels.setdefault(seq_point, seq_label)
    point_to_indices[seq_point].append(seq_idx)

unique_points = list(point_labels)
train_points, temp_points = train_test_split(unique_points, test_size=0.3, random_state=42)
val_points, test_points = train_test_split(temp_points, test_size=0.5, random_state=42)

# Balance training set.
# Oversample at point level so that all windows of a point stay together.
# 'not majority' raises every class except the largest to the majority count;
# 'not minority' would leave the rarest class untouched.
ros = RandomOverSampler(sampling_strategy='not majority', random_state=42)
train_point_labels = [point_labels[p] for p in train_points]
# The sampler only needs something to index, so feed it row positions; this
# also works when the point identifiers themselves are not numeric.
ros.fit_resample(np.arange(len(train_points)).reshape(-1, 1), train_point_labels)
# sample_indices_ holds integer positions into the fitted rows, with repeats
# for the oversampled entries.
balanced_train_points = [train_points[i] for i in ros.sample_indices_]

# Create final indices.
# Expand every (possibly repeated) training point into its sequence indices;
# repeats must be kept, otherwise the oversampling is silently discarded.
train_indices = [i for p in balanced_train_points for i in point_to_indices[p]]
val_point_set = set(val_points)
test_point_set = set(test_points)
val_indices = [i for i, p in enumerate(points_sequences) if p in val_point_set]
test_indices = [i for i, p in enumerate(points_sequences) if p in test_point_set]

X_train = [sequences[i] for i in train_indices]
y_train = [labels[i] for i in train_indices]
X_val = [sequences[i] for i in val_indices]
y_val = [labels[i] for i in val_indices]
X_test = [sequences[i] for i in test_indices]
y_test = [labels[i] for i in test_indices]

### Data Augmentation

In [None]:
def augment_image(img, sensor, label, class_counts):
    """Randomly augment one image, more aggressively for rarer classes.

    Args:
        img: PIL image to augment.
        sensor: Sensor name; colour jitter is applied to ``"RGB"`` only.
        label: Integer index of the image's class in ``class_counts``.
        class_counts: Per-class sample counts, indexable by ``label``.

    Returns:
        The augmented image, or ``img`` itself when no augmentation fired.

    Each call draws its own random numbers, so two images augmented in
    separate calls can receive different flips and rotations.
    NOTE(review): if the sensors of one record must stay spatially aligned,
    the caller has to coordinate the draws - confirm this is intended.
    """
    # 0 for the most frequent class, growing towards 1 as the class gets rarer
    rarity = 1 - class_counts[label] / max(class_counts)

    # Colour saturation jitter, RGB only (probability up to 0.7)
    if sensor == "RGB" and random.random() < 0.7 * rarity:
        saturation = random.uniform(0.8, 1.2)
        img = ImageEnhance.Color(img).enhance(saturation)

    # Horizontal flip for every sensor (probability up to 0.5)
    if random.random() < 0.5 * rarity:
        img = img.transpose(Image.FLIP_LEFT_RIGHT)

    # Small rotation for every sensor (probability up to 0.3)
    if random.random() < 0.3 * rarity:
        angle = random.randint(-15, 15)
        img = img.rotate(angle)

    return img

### Preprocessing

In [None]:
def process_sequence(sequence, augment=False):
    """Convert a sequence of records into one stacked float array.

    For every record, each sensor image listed in the module-level ``sensors``
    is copied, optionally augmented, resized to ``image_size``, scaled to
    [0, 1] and concatenated along the last (channel) axis.

    Args:
        sequence: Iterable of record dicts holding a PIL image per sensor name
            and a ``"LABEL"`` entry.
        augment: When true, every image is passed through ``augment_image``
            using class counts derived from the module-level ``y_train``.

    Returns:
        ``np.ndarray`` with one entry per record along the first axis.
    """
    # Computed on every call, augmenting or not, from the global training labels
    class_counts = np.bincount(y_train)

    def to_unit_range(picture, sensor_name):
        # RGB keeps its colour channels; other sensors become one explicit channel
        if sensor_name != "RGB":
            return np.array(picture.convert("L"))[:, :, np.newaxis] / 255.0
        return np.array(picture) / 255.0

    frames = []
    for record in sequence:
        planes = []
        for sensor in sensors:
            # Work on a copy so the stored record image is never modified
            picture = record[sensor].copy()
            if augment:
                picture = augment_image(picture, sensor, record["LABEL"], class_counts)
            picture = picture.resize(image_size)
            planes.append(to_unit_range(picture, sensor))

        # Stack all sensors of this record along the channel axis
        frames.append(np.concatenate(planes, axis=-1))

    return np.array(frames)

# Reference class weights: total training samples divided by each class's count
class_counts = np.bincount(y_train)
class_weights = dict(enumerate(sum(class_counts) / class_counts))

# Build the image tensors; only the training split is augmented
X_train_np = np.array([process_sequence(train_seq, augment=True) for train_seq in X_train])
X_val_np = np.array(list(map(process_sequence, X_val)))
X_test_np = np.array(list(map(process_sequence, X_test)))

# Label vectors aligned with the tensors above
y_train_np, y_val_np, y_test_np = (
    np.array(split_labels) for split_labels in (y_train, y_val, y_test)
)

### Verification

In [None]:
# (split name, image tensor, label vector) for each split, in reporting order
split_summary = (
    ("Train", X_train_np, y_train_np),
    ("Val", X_val_np, y_val_np),
    ("Test", X_test_np, y_test_np),
)

print("Final Dataset Shapes:")
for split_name, features, targets in split_summary:
    print(f"{split_name}: {features.shape}, {targets.shape}")

# np.unique(..., return_counts=True) gives (distinct labels, occurrences)
print("\nClass Distributions:")
for split_name, _, targets in split_summary:
    print(f"{split_name}: {np.unique(targets, return_counts=True)}")

print("\nClass Weights:", class_weights)