In [1]:
from datasets import load_dataset

# Load the `timm/oxford-iiit-pet` dataset through Hugging Face `datasets`;
# every later cell reads from this `dataset` object.
dataset = load_dataset("timm/oxford-iiit-pet")


In [2]:
# Show the available splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'label', 'image_id', 'label_cat_dog'],
        num_rows: 3680
    })
    test: Dataset({
        features: ['image', 'label', 'image_id', 'label_cat_dog'],
        num_rows: 3669
    })
})

## Save the dataset locally

In [3]:
from pathlib import Path
from loguru import logger
from datasets import DatasetDict
from PIL import Image

# Root directory for exported images; save_image creates one sub-folder per
# label id beneath it.
SAVE_DIR = Path("data/oxford-iiit-pet")
# Passed as `num_proc` to the `Dataset.map` calls that run save_image.
NUM_PROC = 8  
# Class names of the train split's `label` feature; save_image indexes this
# list with the integer label id to obtain the readable label name.
CLASS_NAMES = dataset['train'].features['label'].names

def save_image(example, idx):
    """Write one dataset image to disk as a JPEG and return its path and label name.

    The file is written to ``SAVE_DIR/<label id>/<image_id>.jpg``, or under
    ``SAVE_DIR/unlabeled`` when the example carries no label.

    Before saving, the image is normalised to a mode the JPEG encoder can
    write: images with transparency (RGBA, LA, PA, or palette images with a
    transparency entry) are flattened onto a white background, and any other
    unsupported mode is converted to RGB. Previously only RGBA was handled,
    so e.g. palette images failed inside ``image.save`` and silently ended up
    as rows with ``filepath=None``.

    Args:
        example (dict): Dataset example containing 'image' and, optionally,
            'label' and 'image_id'.
        idx (int): Index of the example; only used in the error log message.

    Returns:
        dict: ``{"filepath": ..., "label_name": ...}``. ``filepath`` is the
        saved path as a string and ``label_name`` the entry of CLASS_NAMES
        for the label (None when there is no label). If anything fails the
        error is logged and both values are None; nothing is raised.
    """
    try:
        image = example['image']
        label_id = example.get('label')  # None when the example has no label
        image_id = example.get('image_id')

        # JPEG has no alpha channel: composite transparent images onto white
        # instead of letting the encoder reject them.
        has_alpha = image.mode in ('RGBA', 'LA', 'PA') or (
            image.mode == 'P' and 'transparency' in image.info
        )
        if has_alpha:
            rgba = image.convert('RGBA')
            background = Image.new('RGB', rgba.size, (255, 255, 255))
            # The alpha band is the paste mask, so transparent pixels stay white.
            background.paste(rgba, mask=rgba.split()[3])
            image = background
        elif image.mode not in ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr'):
            # Modes already writable as JPEG are left untouched; everything
            # else (P without transparency, I, F, ...) is converted to RGB.
            image = image.convert('RGB')

        # Human-readable class name, if the example is labelled
        label_name = CLASS_NAMES[label_id] if label_id is not None else None

        # One sub-directory per label id ('unlabeled' when there is none)
        label_dir = SAVE_DIR / str(label_id if label_id is not None else 'unlabeled')
        label_dir.mkdir(parents=True, exist_ok=True)

        filepath = label_dir / f"{image_id}.jpg"

        image.save(filepath, "JPEG", quality=95, optimize=True)

        return {
            "filepath": str(filepath),
            "label_name": label_name
        }

    except Exception as e:
        # Best-effort export: a single bad image must not abort the whole map.
        logger.error(f"Error saving image {idx}: {str(e)}")
        return {
            "filepath": None,
            "label_name": None
        }

# Export every image to disk and build the active-learning splits in one pass.
# Each tuple is (target split name, source split name, progress-bar text);
# the integer `label` column is dropped because save_image adds `label_name`.
active_learning_dataset = DatasetDict(
    {
        target_split: dataset[source_split].map(
            save_image,
            with_indices=True,
            num_proc=NUM_PROC,
            desc=progress_text,
            remove_columns="label",
        )
        for target_split, source_split, progress_text in (
            ("evaluation", "test", "Saving evaluation images"),
            ("unlabeled", "train", "Saving unlabeled images"),
        )
    }
)



Saving evaluation images (num_proc=8):   0%|          | 0/3669 [00:00<?, ? examples/s]

Saving unlabeled images (num_proc=8):   0%|          | 0/3680 [00:00<?, ? examples/s]

In [4]:
# Confirm both splits now expose `filepath` and `label_name` and no `label`.
active_learning_dataset

DatasetDict({
    evaluation: Dataset({
        features: ['image', 'image_id', 'label_cat_dog', 'filepath', 'label_name'],
        num_rows: 3669
    })
    unlabeled: Dataset({
        features: ['image', 'image_id', 'label_cat_dog', 'filepath', 'label_name'],
        num_rows: 3680
    })
})

## Make Initial Samples

In [5]:
import numpy as np

SEED = 42  # fixed so the initial labelled pool is identical on every run
n_samples_per_class = 10

rng = np.random.default_rng(SEED)

# Materialise the label column once; reading it inside the loop would
# rebuild the same array for every class.
label_names = np.asarray(
    list(active_learning_dataset["unlabeled"]["label_name"]), dtype=object
)

# save_image returns label_name=None for images it failed to write; those rows
# have no file on disk, so None must not be treated as a class to sample from.
unique_labels = [
    label
    for label in active_learning_dataset["unlabeled"].unique("label_name")
    if label is not None
]

samples = []
for label in unique_labels:
    label_indices = np.where(label_names == label)[0]
    # Cap at the class size so a small class does not raise with replace=False.
    n_take = min(n_samples_per_class, len(label_indices))
    random_indices = rng.choice(label_indices, size=n_take, replace=False)
    samples.extend(random_indices.tolist())

initial_samples = active_learning_dataset["unlabeled"].select(samples)

# Verify the result: n_samples_per_class rows for every class
# (370 rows in total for the 37 classes of the recorded run).
print(f"Total samples: {len(initial_samples)}")
print("\nSamples per class:")
print(initial_samples.select_columns(["label_name"]).to_pandas().value_counts())

Total samples: 370

Samples per class:
label_name                
abyssinian                    10
american_bulldog              10
american_pit_bull_terrier     10
basset_hound                  10
beagle                        10
bengal                        10
birman                        10
bombay                        10
boxer                         10
british_shorthair             10
chihuahua                     10
egyptian_mau                  10
english_cocker_spaniel        10
english_setter                10
german_shorthaired            10
great_pyrenees                10
havanese                      10
japanese_chin                 10
keeshond                      10
leonberger                    10
maine_coon                    10
miniature_pinscher            10
newfoundland                  10
persian                       10
pomeranian                    10
pug                           10
ragdoll                       10
russian_blue                  10
saint_bern

In [6]:
# Check the columns and row count of the sampled subset.
initial_samples

Dataset({
    features: ['image', 'image_id', 'label_cat_dog', 'filepath', 'label_name'],
    num_rows: 370
})

In [7]:
# Spot-check one record: the saved path should sit under its label-id folder.
initial_samples[1]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=403x500>,
 'image_id': 'Maine_Coon_136',
 'label_cat_dog': 0,
 'filepath': 'data/oxford-iiit-pet/20/Maine_Coon_136.jpg',
 'label_name': 'maine_coon'}

In [8]:
# Keep only what the manifest needs. select_columns (rather than
# remove_columns) makes this cell safe to re-run: the columns to drop are
# already gone after the first execution, but the columns to keep still exist.
initial_samples = initial_samples.select_columns(["filepath", "label_name"])
df = initial_samples.to_pandas().rename(columns={"label_name": "label"})

df

Unnamed: 0,filepath,label
0,data/oxford-iiit-pet/20/Maine_Coon_100.jpg,maine_coon
1,data/oxford-iiit-pet/20/Maine_Coon_136.jpg,maine_coon
2,data/oxford-iiit-pet/20/Maine_Coon_131.jpg,maine_coon
3,data/oxford-iiit-pet/20/Maine_Coon_201.jpg,maine_coon
4,data/oxford-iiit-pet/20/Maine_Coon_171.jpg,maine_coon
...,...,...
365,data/oxford-iiit-pet/7/Bombay_173.jpg,bombay
366,data/oxford-iiit-pet/7/Bombay_178.jpg,bombay
367,data/oxford-iiit-pet/7/Bombay_100.jpg,bombay
368,data/oxford-iiit-pet/7/Bombay_13.jpg,bombay


In [9]:
# Write next to the exported images; derived from SAVE_DIR so the manifest
# follows the image directory if that constant is ever changed.
df.to_parquet(SAVE_DIR / "initial_samples.parquet")

## Make Evaluation Split

In [10]:
# Inspect the first evaluation record before trimming its columns.
active_learning_dataset["evaluation"][0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x334>,
 'image_id': 'newfoundland_31',
 'label_cat_dog': 1,
 'filepath': 'data/oxford-iiit-pet/22/newfoundland_31.jpg',
 'label_name': 'newfoundland'}

In [11]:
# Reduce the evaluation split to a (filepath, label) manifest.
eval_samples = (
    active_learning_dataset["evaluation"]
    .remove_columns(["image", "image_id", "label_cat_dog"])
    .rename_column("label_name", "label")
)
eval_samples

Dataset({
    features: ['filepath', 'label'],
    num_rows: 3669
})

In [12]:
# Convert the evaluation manifest to pandas; `df` is what gets written to
# parquet afterwards.
df = eval_samples.to_pandas()
df

Unnamed: 0,filepath,label
0,data/oxford-iiit-pet/22/newfoundland_31.jpg,newfoundland
1,data/oxford-iiit-pet/25/pug_57.jpg,pug
2,data/oxford-iiit-pet/1/american_bulldog_80.jpg,american_bulldog
3,data/oxford-iiit-pet/15/great_pyrenees_22.jpg,great_pyrenees
4,data/oxford-iiit-pet/16/havanese_97.jpg,havanese
...,...,...
3664,data/oxford-iiit-pet/10/chihuahua_91.jpg,chihuahua
3665,data/oxford-iiit-pet/17/japanese_chin_83.jpg,japanese_chin
3666,data/oxford-iiit-pet/28/saint_bernard_88.jpg,saint_bernard
3667,data/oxford-iiit-pet/21/miniature_pinscher_32.jpg,miniature_pinscher


In [13]:
# Write next to the exported images; derived from SAVE_DIR so the manifest
# follows the image directory if that constant is ever changed.
df.to_parquet(SAVE_DIR / "evaluation_samples.parquet")

## Make Unlabeled Split

In [16]:
# Reduce the unlabeled split to a (filepath, label) manifest and preview it.
unlabeled_samples = (
    active_learning_dataset["unlabeled"]
    .remove_columns(["image", "image_id", "label_cat_dog"])
    .rename_column("label_name", "label")
)
df = unlabeled_samples.to_pandas()
df


Unnamed: 0,filepath,label
0,data/oxford-iiit-pet/20/Maine_Coon_204.jpg,maine_coon
1,data/oxford-iiit-pet/1/american_bulldog_138.jpg,american_bulldog
2,data/oxford-iiit-pet/18/keeshond_112.jpg,keeshond
3,data/oxford-iiit-pet/16/havanese_157.jpg,havanese
4,data/oxford-iiit-pet/14/german_shorthaired_132...,german_shorthaired
...,...,...
3675,data/oxford-iiit-pet/14/german_shorthaired_138...,german_shorthaired
3676,data/oxford-iiit-pet/26/Ragdoll_169.jpg,ragdoll
3677,data/oxford-iiit-pet/1/american_bulldog_107.jpg,american_bulldog
3678,data/oxford-iiit-pet/35/wheaten_terrier_159.jpg,wheaten_terrier


Remove the initial samples from the unlabeled samples

In [17]:
# Collect the initial samples' file paths in a set; the filter that follows
# tests each unlabeled row's path for membership in it.
initial_filepaths = set(initial_samples['filepath'])
# NOTE(review): displaying the set prints every path; `len(initial_filepaths)`
# would probably be enough here.
initial_filepaths

{'data/oxford-iiit-pet/0/Abyssinian_125.jpg',
 'data/oxford-iiit-pet/0/Abyssinian_129.jpg',
 'data/oxford-iiit-pet/0/Abyssinian_133.jpg',
 'data/oxford-iiit-pet/0/Abyssinian_141.jpg',
 'data/oxford-iiit-pet/0/Abyssinian_143.jpg',
 'data/oxford-iiit-pet/0/Abyssinian_152.jpg',
 'data/oxford-iiit-pet/0/Abyssinian_160.jpg',
 'data/oxford-iiit-pet/0/Abyssinian_161.jpg',
 'data/oxford-iiit-pet/0/Abyssinian_17.jpg',
 'data/oxford-iiit-pet/0/Abyssinian_18.jpg',
 'data/oxford-iiit-pet/1/american_bulldog_103.jpg',
 'data/oxford-iiit-pet/1/american_bulldog_107.jpg',
 'data/oxford-iiit-pet/1/american_bulldog_117.jpg',
 'data/oxford-iiit-pet/1/american_bulldog_118.jpg',
 'data/oxford-iiit-pet/1/american_bulldog_12.jpg',
 'data/oxford-iiit-pet/1/american_bulldog_126.jpg',
 'data/oxford-iiit-pet/1/american_bulldog_140.jpg',
 'data/oxford-iiit-pet/1/american_bulldog_16.jpg',
 'data/oxford-iiit-pet/1/american_bulldog_179.jpg',
 'data/oxford-iiit-pet/1/american_bulldog_19.jpg',
 'data/oxford-iiit-pet/10

In [18]:
def _outside_initial_pool(example):
    """Return True for rows whose file is not part of the initial samples."""
    return example['filepath'] not in initial_filepaths


# Keep only the rows that were not drawn into the initial labelled pool.
unlabeled_samples = unlabeled_samples.filter(_outside_initial_pool)

unlabeled_samples

Filter:   0%|          | 0/3680 [00:00<?, ? examples/s]

Dataset({
    features: ['filepath', 'label'],
    num_rows: 3310
})

In [20]:
# The unlabeled manifest carries file paths only, so the label column is
# dropped before export.
df = unlabeled_samples.to_pandas().drop(columns=["label"])
df


Unnamed: 0,filepath
0,data/oxford-iiit-pet/20/Maine_Coon_204.jpg
1,data/oxford-iiit-pet/1/american_bulldog_138.jpg
2,data/oxford-iiit-pet/16/havanese_157.jpg
3,data/oxford-iiit-pet/14/german_shorthaired_132...
4,data/oxford-iiit-pet/3/basset_hound_161.jpg
...,...
3305,data/oxford-iiit-pet/13/english_setter_186.jpg
3306,data/oxford-iiit-pet/14/german_shorthaired_138...
3307,data/oxford-iiit-pet/26/Ragdoll_169.jpg
3308,data/oxford-iiit-pet/35/wheaten_terrier_159.jpg


In [21]:
# Write next to the exported images; derived from SAVE_DIR so the manifest
# follows the image directory if that constant is ever changed.
df.to_parquet(SAVE_DIR / "unlabeled_samples.parquet")