In [1]:
from datasets import load_dataset

# Load the "AI-Lab-Makerere/beans" dataset; the displayed result further down
# shows a DatasetDict with 'train', 'validation' and 'test' splits.
dataset = load_dataset("AI-Lab-Makerere/beans")


In [2]:
# Show the available splits with their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['image_file_path', 'image', 'labels'],
        num_rows: 1034
    })
    validation: Dataset({
        features: ['image_file_path', 'image', 'labels'],
        num_rows: 133
    })
    test: Dataset({
        features: ['image_file_path', 'image', 'labels'],
        num_rows: 128
    })
})

## Save Dataset Into a Local Directory

In [3]:
from pathlib import Path
from loguru import logger
from datasets import DatasetDict
from PIL import Image

# Root directory under which save_image writes the exported JPEG files.
SAVE_DIR = Path("data/beans")
# Passed as num_proc to Dataset.map when exporting the images.
NUM_PROC = 8
# Class names of the 'labels' feature; save_image indexes this list with the
# integer label id to obtain the human-readable label name.
CLASS_NAMES = dataset['train'].features['labels'].names

def save_image(example, idx):
    """Save one dataset image to disk as a JPEG and report where it went.

    The file is written to ``SAVE_DIR/<label id>/<image id>.jpg``, or to
    ``SAVE_DIR/unlabeled/<image id>.jpg`` when the example has no usable
    label.

    Args:
        example (dict): Dataset example. Reads 'image' (a PIL image) and,
            when present, 'labels' (integer class id) and
            'image_file_path' (original path, used to derive the file name).
        idx (int): Index of the example. Used as the file name when
            'image_file_path' is missing, and in the error log message.

    Returns:
        dict: ``{"filepath": str, "label_name": str or None}`` on success.
        If anything goes wrong both values are ``None``; the error is logged
        and not re-raised, so one bad image does not abort the whole map.
    """
    try:
        image = example['image']

        label_id = example.get('labels')  # Get label if it exists
        # A negative id would silently index CLASS_NAMES from the end and
        # attach the wrong class name, so treat it as "no label".
        if label_id is not None and label_id < 0:
            label_id = None

        # Derive the file name from the original path. Path.stem strips any
        # extension (not only a literal ".jpg"); fall back to the example
        # index when the path is absent instead of failing on None.
        image_path = example.get('image_file_path')
        image_id = Path(image_path).stem if image_path else str(idx)

        # JPEG has no alpha channel: flatten transparent images onto white.
        has_alpha = image.mode in ('RGBA', 'LA') or (
            image.mode == 'P' and 'transparency' in image.info
        )
        if has_alpha:
            rgba = image.convert('RGBA')
            # Create a white background
            background = Image.new('RGB', rgba.size, (255, 255, 255))
            # Paste the image using alpha channel as mask
            background.paste(rgba, mask=rgba.getchannel('A'))
            image = background
        elif image.mode not in ('RGB', 'L', 'CMYK'):
            # Remaining modes (e.g. palette images) are converted so the
            # JPEG writer accepts them.
            image = image.convert('RGB')

        # Get label name if label exists
        label_name = CLASS_NAMES[label_id] if label_id is not None else None

        # Create directory structure
        label_dir = SAVE_DIR / str(label_id if label_id is not None else 'unlabeled')
        label_dir.mkdir(parents=True, exist_ok=True)

        # Create filename with label subdirectory
        filepath = label_dir / f"{image_id}.jpg"

        # Save with quality optimization
        image.save(filepath, "JPEG", quality=95, optimize=True)

        return {
            "filepath": str(filepath),
            "label_name": label_name
        }

    except Exception as e:
        # Deliberate best-effort: log and return a placeholder row.
        logger.error(f"Error saving image {idx}: {str(e)}")
        return {
            "filepath": None,
            "label_name": None
        }

# Export the images of the source splits to disk and build the working
# DatasetDict: 'test' becomes the evaluation set, 'train' the unlabeled pool.
#
# save_image is called for its side effect (writing JPEG files). A cached map
# result would be returned without calling it, leaving nothing on disk, so
# cache loading is disabled explicitly.
active_learning_dataset = DatasetDict({
    target_split: dataset[source_split].map(
        save_image,
        with_indices=True,
        num_proc=NUM_PROC,
        desc=f"Saving {target_split} images",
        remove_columns=["image_file_path", "labels"],
        load_from_cache_file=False,
    )
    for target_split, source_split in (
        ("evaluation", "test"),
        ("unlabeled", "train"),
    )
})



Saving evaluation images (num_proc=8):   0%|          | 0/128 [00:00<?, ? examples/s]

Saving unlabeled images (num_proc=8):   0%|          | 0/1034 [00:00<?, ? examples/s]

## Make Initial Dataset

In [4]:
import numpy as np

# Fixed seed so the same initial labeled set is drawn on every run.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

n_samples_per_class = 10

# Materialise the label column once instead of once per class.
label_names = np.array(active_learning_dataset["unlabeled"]["label_name"])

# Rows for which save_image failed carry label_name None; that is not a class.
unique_labels = [
    label
    for label in active_learning_dataset["unlabeled"].unique("label_name")
    if label is not None
]

samples = []
for label in unique_labels:
    label_indices = np.flatnonzero(label_names == label)
    # Sample n_samples_per_class random indices without replacement
    random_indices = rng.choice(
        label_indices, size=n_samples_per_class, replace=False
    )
    samples.extend(random_indices.tolist())

initial_samples = active_learning_dataset["unlabeled"].select(samples)

# Verify the result (n_samples_per_class rows for every class)
assert len(initial_samples) == n_samples_per_class * len(unique_labels)
print(f"Total samples: {len(initial_samples)}")
print("\nSamples per class:")
print(initial_samples.select_columns(["label_name"]).to_pandas().value_counts())

Total samples: 30

Samples per class:
label_name       
angular_leaf_spot    10
bean_rust            10
healthy              10
Name: count, dtype: int64


In [5]:
# Show the columns and row count of the sampled initial set.
initial_samples

Dataset({
    features: ['image', 'filepath', 'label_name'],
    num_rows: 30
})

In [6]:
# Inspect the first sampled record (image, saved file path, label name).
initial_samples[0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x500>,
 'filepath': 'data/beans/0/angular_leaf_spot_train.240.jpg',
 'label_name': 'angular_leaf_spot'}

In [7]:
# Drop the image column only while it is still present, so this cell can be
# re-run without remove_columns failing on the already-removed column.
if "image" in initial_samples.column_names:
    initial_samples = initial_samples.remove_columns(["image"])

# Tabular view of the initial set with the label column named "label".
df = initial_samples.to_pandas().rename(columns={"label_name": "label"})

df

Unnamed: 0,filepath,label
0,data/beans/0/angular_leaf_spot_train.240.jpg,angular_leaf_spot
1,data/beans/0/angular_leaf_spot_train.219.jpg,angular_leaf_spot
2,data/beans/0/angular_leaf_spot_train.239.jpg,angular_leaf_spot
3,data/beans/0/angular_leaf_spot_train.338.jpg,angular_leaf_spot
4,data/beans/0/angular_leaf_spot_train.274.jpg,angular_leaf_spot
5,data/beans/0/angular_leaf_spot_train.35.jpg,angular_leaf_spot
6,data/beans/0/angular_leaf_spot_train.14.jpg,angular_leaf_spot
7,data/beans/0/angular_leaf_spot_train.232.jpg,angular_leaf_spot
8,data/beans/0/angular_leaf_spot_train.319.jpg,angular_leaf_spot
9,data/beans/0/angular_leaf_spot_train.16.jpg,angular_leaf_spot


In [8]:
# Persist the initial labeled samples (the 'filepath' and 'label' columns of df) as Parquet.
df.to_parquet("data/beans/initial_samples.parquet")

## Make Evaluation Dataset

In [9]:
# Inspect the first record of the evaluation split.
active_learning_dataset["evaluation"][0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x500>,
 'filepath': 'data/beans/0/angular_leaf_spot_test.0.jpg',
 'label_name': 'angular_leaf_spot'}

In [10]:
# Evaluation manifest: keep only the saved file path and the label name,
# with the label column exposed as "label".
eval_samples = (
    active_learning_dataset["evaluation"]
    .remove_columns(["image"])
    .rename_column("label_name", "label")
)
eval_samples

Dataset({
    features: ['filepath', 'label'],
    num_rows: 128
})

In [11]:
# Convert the evaluation manifest to a DataFrame.
# Note: this rebinds `df`, which until now held the initial-samples frame.
df = eval_samples.to_pandas()
df

Unnamed: 0,filepath,label
0,data/beans/0/angular_leaf_spot_test.0.jpg,angular_leaf_spot
1,data/beans/0/angular_leaf_spot_test.1.jpg,angular_leaf_spot
2,data/beans/0/angular_leaf_spot_test.10.jpg,angular_leaf_spot
3,data/beans/0/angular_leaf_spot_test.11.jpg,angular_leaf_spot
4,data/beans/0/angular_leaf_spot_test.12.jpg,angular_leaf_spot
...,...,...
123,data/beans/2/healthy_test.5.jpg,healthy
124,data/beans/2/healthy_test.6.jpg,healthy
125,data/beans/2/healthy_test.7.jpg,healthy
126,data/beans/2/healthy_test.8.jpg,healthy


In [12]:
# Persist the evaluation manifest (df built from eval_samples) as Parquet.
df.to_parquet("data/beans/evaluation_samples.parquet")

## Make Unlabeled Dataset

In [13]:
# Unlabeled-pool manifest: file path plus label name (renamed to "label"),
# without the decoded image column.
unlabeled_samples = (
    active_learning_dataset["unlabeled"]
    .remove_columns(["image"])
    .rename_column("label_name", "label")
)
df = unlabeled_samples.to_pandas()
df


Unnamed: 0,filepath,label
0,data/beans/0/angular_leaf_spot_train.0.jpg,angular_leaf_spot
1,data/beans/0/angular_leaf_spot_train.1.jpg,angular_leaf_spot
2,data/beans/0/angular_leaf_spot_train.10.jpg,angular_leaf_spot
3,data/beans/0/angular_leaf_spot_train.100.jpg,angular_leaf_spot
4,data/beans/0/angular_leaf_spot_train.101.jpg,angular_leaf_spot
...,...,...
1029,data/beans/2/healthy_train.95.jpg,healthy
1030,data/beans/2/healthy_train.96.jpg,healthy
1031,data/beans/2/healthy_train.97.jpg,healthy
1032,data/beans/2/healthy_train.98.jpg,healthy


In [14]:
# File paths of the initial samples, as a set so the `not in` membership
# test used when filtering the unlabeled pool is cheap.
initial_filepaths = set(initial_samples['filepath'])
initial_filepaths

{'data/beans/0/angular_leaf_spot_train.14.jpg',
 'data/beans/0/angular_leaf_spot_train.16.jpg',
 'data/beans/0/angular_leaf_spot_train.219.jpg',
 'data/beans/0/angular_leaf_spot_train.232.jpg',
 'data/beans/0/angular_leaf_spot_train.239.jpg',
 'data/beans/0/angular_leaf_spot_train.240.jpg',
 'data/beans/0/angular_leaf_spot_train.274.jpg',
 'data/beans/0/angular_leaf_spot_train.319.jpg',
 'data/beans/0/angular_leaf_spot_train.338.jpg',
 'data/beans/0/angular_leaf_spot_train.35.jpg',
 'data/beans/1/bean_rust_train.134.jpg',
 'data/beans/1/bean_rust_train.178.jpg',
 'data/beans/1/bean_rust_train.273.jpg',
 'data/beans/1/bean_rust_train.281.jpg',
 'data/beans/1/bean_rust_train.310.jpg',
 'data/beans/1/bean_rust_train.324.jpg',
 'data/beans/1/bean_rust_train.341.jpg',
 'data/beans/1/bean_rust_train.54.jpg',
 'data/beans/1/bean_rust_train.56.jpg',
 'data/beans/1/bean_rust_train.88.jpg',
 'data/beans/2/healthy_train.159.jpg',
 'data/beans/2/healthy_train.173.jpg',
 'data/beans/2/healthy_train

In [15]:
# Keep only the rows whose file is not already part of the initial set.
unlabeled_samples = unlabeled_samples.filter(
    lambda filepath: filepath not in initial_filepaths,
    input_columns="filepath",
)

unlabeled_samples

Filter:   0%|          | 0/1034 [00:00<?, ? examples/s]

Dataset({
    features: ['filepath', 'label'],
    num_rows: 1004
})

In [16]:
# df = unlabeled_samples.to_pandas()
# df = df.drop(columns=["label"])
# df


In [17]:
# Build the frame from the *filtered* unlabeled_samples before saving. The
# previous `df` was created before the initial samples were filtered out, so
# writing it would put those samples back into the unlabeled pool.
# Columns are written as-is ('filepath', 'label').
df = unlabeled_samples.to_pandas()
df.to_parquet("data/beans/unlabeled_samples.parquet")