In [1]:
from datasets import load_dataset

dataset = load_dataset("blanchon/EuroSAT_RGB")


README.md:   0%|          | 0.00/3.38k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/105M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/34.8M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16200 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5400 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5400 [00:00<?, ? examples/s]

In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'label', 'filename'],
        num_rows: 16200
    })
    test: Dataset({
        features: ['image', 'label', 'filename'],
        num_rows: 5400
    })
    validation: Dataset({
        features: ['image', 'label', 'filename'],
        num_rows: 5400
    })
})

## Save the dataset locally

In [6]:
from pathlib import Path
from loguru import logger
from datasets import DatasetDict

SAVE_DIR = Path("data/eurosat_rgb")
NUM_PROC = 8  
CLASS_NAMES = dataset['train'].features['label'].names

def save_image(example, idx):
    """Save a dataset image to disk with error handling and add label name.
    
    Args:
        example (dict): Dataset example containing 'image' and 'label'
        idx (int): Index of the example
        
    Returns:
        dict: Dictionary containing the saved filepath and label_name
    """
    try:
        image = example['image']
        label_id = example.get('label')  # Get label if it exists (won't exist for unlabeled)
        
        # Get label name if label exists
        label_name = CLASS_NAMES[label_id] if label_id is not None else None
        
        # Create directory structure
        label_dir = SAVE_DIR / str(label_id if label_id is not None else 'unlabeled')
        label_dir.mkdir(parents=True, exist_ok=True)
        
        # Create filename with label subdirectory
        filename = example['filename'].split('.')[0]
        filepath = label_dir / f"{filename}.jpg"
        
        # Save with quality optimization
        image.save(filepath, "JPEG", quality=95, optimize=True)
        
        return {
            "filepath": str(filepath),
            "label_name": label_name
        }
    
    except Exception as e:
        logger.error(f"Error saving image {idx}: {str(e)}")
        return {
            "filepath": None,
            "label_name": None
        }

active_learning_dataset = DatasetDict()

active_learning_dataset['evaluation'] = dataset['test'].map(
    save_image,
    with_indices=True,
    num_proc=NUM_PROC,
    desc="Saving evaluation images",
    remove_columns="label"
)

active_learning_dataset['unlabeled'] = dataset['train'].map(
    save_image,
    with_indices=True,
    num_proc=NUM_PROC,
    desc="Saving unlabeled images",
    remove_columns="label"
)

Saving evaluation images (num_proc=8):   0%|          | 0/5400 [00:00<?, ? examples/s]

Saving unlabeled images (num_proc=8):   0%|          | 0/16200 [00:00<?, ? examples/s]

In [7]:
active_learning_dataset

DatasetDict({
    evaluation: Dataset({
        features: ['image', 'filename', 'filepath', 'label_name'],
        num_rows: 5400
    })
    unlabeled: Dataset({
        features: ['image', 'filename', 'filepath', 'label_name'],
        num_rows: 16200
    })
})

## Make Initial Samples

In [8]:
import numpy as np

unique_labels = active_learning_dataset["unlabeled"].unique("label_name")
samples = []
n_samples_per_class = 10

for label in unique_labels:
    label_indices = np.where(np.array(active_learning_dataset["unlabeled"]["label_name"]) == label)[0]
    # Sample 10 random indices without replacement
    random_indices = np.random.choice(
        label_indices, size=n_samples_per_class, replace=False
    )
    samples.extend(random_indices)

initial_samples = active_learning_dataset["unlabeled"].select(samples)

# Verify the result (should show 100 rows total, 10 per class)
print(f"Total samples: {len(initial_samples)}")
print("\nSamples per class:")
print(initial_samples.select_columns(["label_name"]).to_pandas().value_counts())

Total samples: 100

Samples per class:
label_name           
Annual Crop              10
Forest                   10
Herbaceous Vegetation    10
Highway                  10
Industrial Buildings     10
Pasture                  10
Permanent Crop           10
Residential Buildings    10
River                    10
SeaLake                  10
Name: count, dtype: int64


In [9]:
initial_samples

Dataset({
    features: ['image', 'filename', 'filepath', 'label_name'],
    num_rows: 100
})

In [10]:
initial_samples = initial_samples.remove_columns("image")
df = initial_samples.to_pandas()
df = df.rename(columns={"label_name": "label"})
df

Unnamed: 0,filename,filepath,label
0,AnnualCrop_2216.tif,data/eurosat_rgb/0/AnnualCrop_2216.jpg,Annual Crop
1,AnnualCrop_1331.tif,data/eurosat_rgb/0/AnnualCrop_1331.jpg,Annual Crop
2,AnnualCrop_631.tif,data/eurosat_rgb/0/AnnualCrop_631.jpg,Annual Crop
3,AnnualCrop_2904.tif,data/eurosat_rgb/0/AnnualCrop_2904.jpg,Annual Crop
4,AnnualCrop_1963.tif,data/eurosat_rgb/0/AnnualCrop_1963.jpg,Annual Crop
...,...,...,...
95,SeaLake_1012.tif,data/eurosat_rgb/9/SeaLake_1012.jpg,SeaLake
96,SeaLake_735.tif,data/eurosat_rgb/9/SeaLake_735.jpg,SeaLake
97,SeaLake_561.tif,data/eurosat_rgb/9/SeaLake_561.jpg,SeaLake
98,SeaLake_2407.tif,data/eurosat_rgb/9/SeaLake_2407.jpg,SeaLake


In [11]:
df.to_parquet("data/eurosat_rgb/initial_samples.parquet")

## Make Evaluation Split

In [12]:
eval_samples = active_learning_dataset["evaluation"].remove_columns("image")
eval_samples = eval_samples.rename_column("label_name", "label")
eval_samples

Dataset({
    features: ['filename', 'filepath', 'label'],
    num_rows: 5400
})

In [13]:
df = eval_samples.to_pandas()
df

Unnamed: 0,filename,filepath,label
0,AnnualCrop_1002.tif,data/eurosat_rgb/0/AnnualCrop_1002.jpg,Annual Crop
1,AnnualCrop_1003.tif,data/eurosat_rgb/0/AnnualCrop_1003.jpg,Annual Crop
2,AnnualCrop_1007.tif,data/eurosat_rgb/0/AnnualCrop_1007.jpg,Annual Crop
3,AnnualCrop_1012.tif,data/eurosat_rgb/0/AnnualCrop_1012.jpg,Annual Crop
4,AnnualCrop_1015.tif,data/eurosat_rgb/0/AnnualCrop_1015.jpg,Annual Crop
...,...,...,...
5395,SeaLake_975.tif,data/eurosat_rgb/9/SeaLake_975.jpg,SeaLake
5396,SeaLake_983.tif,data/eurosat_rgb/9/SeaLake_983.jpg,SeaLake
5397,SeaLake_989.tif,data/eurosat_rgb/9/SeaLake_989.jpg,SeaLake
5398,SeaLake_990.tif,data/eurosat_rgb/9/SeaLake_990.jpg,SeaLake


In [14]:
df.to_parquet("data/eurosat_rgb/evaluation_samples.parquet")

## Make Unlabeled Split

In [15]:
unlabeled_samples = active_learning_dataset["unlabeled"].remove_columns("image")
unlabeled_samples = unlabeled_samples.rename_column("label_name", "label")

df = unlabeled_samples.to_pandas()
df


Unnamed: 0,filename,filepath,label
0,AnnualCrop_1.tif,data/eurosat_rgb/0/AnnualCrop_1.jpg,Annual Crop
1,AnnualCrop_10.tif,data/eurosat_rgb/0/AnnualCrop_10.jpg,Annual Crop
2,AnnualCrop_100.tif,data/eurosat_rgb/0/AnnualCrop_100.jpg,Annual Crop
3,AnnualCrop_1000.tif,data/eurosat_rgb/0/AnnualCrop_1000.jpg,Annual Crop
4,AnnualCrop_1001.tif,data/eurosat_rgb/0/AnnualCrop_1001.jpg,Annual Crop
...,...,...,...
16195,SeaLake_993.tif,data/eurosat_rgb/9/SeaLake_993.jpg,SeaLake
16196,SeaLake_994.tif,data/eurosat_rgb/9/SeaLake_994.jpg,SeaLake
16197,SeaLake_995.tif,data/eurosat_rgb/9/SeaLake_995.jpg,SeaLake
16198,SeaLake_996.tif,data/eurosat_rgb/9/SeaLake_996.jpg,SeaLake


Remove the initial samples from the unlabeled samples

In [16]:
initial_filepaths = set(initial_samples['filepath'])
initial_filepaths

{'data/eurosat_rgb/0/AnnualCrop_1331.jpg',
 'data/eurosat_rgb/0/AnnualCrop_1842.jpg',
 'data/eurosat_rgb/0/AnnualCrop_1963.jpg',
 'data/eurosat_rgb/0/AnnualCrop_2216.jpg',
 'data/eurosat_rgb/0/AnnualCrop_2783.jpg',
 'data/eurosat_rgb/0/AnnualCrop_2904.jpg',
 'data/eurosat_rgb/0/AnnualCrop_545.jpg',
 'data/eurosat_rgb/0/AnnualCrop_631.jpg',
 'data/eurosat_rgb/0/AnnualCrop_634.jpg',
 'data/eurosat_rgb/0/AnnualCrop_928.jpg',
 'data/eurosat_rgb/1/Forest_1372.jpg',
 'data/eurosat_rgb/1/Forest_1712.jpg',
 'data/eurosat_rgb/1/Forest_1787.jpg',
 'data/eurosat_rgb/1/Forest_2120.jpg',
 'data/eurosat_rgb/1/Forest_2657.jpg',
 'data/eurosat_rgb/1/Forest_2937.jpg',
 'data/eurosat_rgb/1/Forest_391.jpg',
 'data/eurosat_rgb/1/Forest_605.jpg',
 'data/eurosat_rgb/1/Forest_616.jpg',
 'data/eurosat_rgb/1/Forest_9.jpg',
 'data/eurosat_rgb/2/HerbaceousVegetation_1418.jpg',
 'data/eurosat_rgb/2/HerbaceousVegetation_1539.jpg',
 'data/eurosat_rgb/2/HerbaceousVegetation_1569.jpg',
 'data/eurosat_rgb/2/Herbaceous

In [17]:
# Filter out rows that are in initial_samples
unlabeled_samples = unlabeled_samples.filter(
    lambda x: x['filepath'] not in initial_filepaths
)

unlabeled_samples

Filter:   0%|          | 0/16200 [00:00<?, ? examples/s]

Dataset({
    features: ['filename', 'filepath', 'label'],
    num_rows: 16100
})

In [18]:
df = unlabeled_samples.to_pandas()
df
df.to_parquet("data/eurosat_rgb/unlabeled_samples.parquet")
