In [1]:
from datasets import load_dataset

dataset = load_dataset("sasha/dog-food")


README.md:   0%|          | 0.00/4.37k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

(…)-00000-of-00001-9bf5abf8b080cbba.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

(…)-00000-of-00001-6ea6ccdcc8fa38d5.parquet:   0%|          | 0.00/85.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2100 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/900 [00:00<?, ? examples/s]

In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 2100
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 900
    })
})

## Save the dataset locally

In [4]:
from pathlib import Path
from loguru import logger
from datasets import DatasetDict

SAVE_DIR = Path("data/dog-food")
NUM_PROC = 8  
CLASS_NAMES = dataset['train'].features['label'].names

def save_image(example, idx):
    """Save a dataset image to disk with error handling and add label name.
    
    Args:
        example (dict): Dataset example containing 'image' and 'label'
        idx (int): Index of the example
        
    Returns:
        dict: Dictionary containing the saved filepath and label_name
    """
    try:
        image = example['image']
        label_id = example.get('label')  # Get label if it exists (won't exist for unlabeled)
        
        # Get label name if label exists
        label_name = CLASS_NAMES[label_id] if label_id is not None else None
        
        # Create directory structure
        label_dir = SAVE_DIR / str(label_id if label_id is not None else 'unlabeled')
        label_dir.mkdir(parents=True, exist_ok=True)
        
        # Create filename with label subdirectory
        filepath = label_dir / f"{idx:05d}.jpg"
        
        # Save with quality optimization
        image.save(filepath, "JPEG", quality=95, optimize=True)
        
        return {
            "filepath": str(filepath),
            "label_name": label_name
        }
    
    except Exception as e:
        logger.error(f"Error saving image {idx}: {str(e)}")
        return {
            "filepath": None,
            "label_name": None
        }

active_learning_dataset = DatasetDict()

active_learning_dataset['evaluation'] = dataset['test'].map(
    save_image,
    with_indices=True,
    num_proc=NUM_PROC,
    desc="Saving evaluation images",
    remove_columns="label"
)

active_learning_dataset['unlabeled'] = dataset['train'].map(
    save_image,
    with_indices=True,
    num_proc=NUM_PROC,
    desc="Saving unlabeled images",
    remove_columns="label"
)



Saving evaluation images (num_proc=8):   0%|          | 0/900 [00:00<?, ? examples/s]

Saving unlabeled images (num_proc=8):   0%|          | 0/2100 [00:00<?, ? examples/s]

In [5]:
active_learning_dataset

DatasetDict({
    evaluation: Dataset({
        features: ['image', 'filepath', 'label_name'],
        num_rows: 900
    })
    unlabeled: Dataset({
        features: ['image', 'filepath', 'label_name'],
        num_rows: 2100
    })
})

## Make Initial Samples

In [8]:
import numpy as np

unique_labels = active_learning_dataset["unlabeled"].unique("label_name")
samples = []
n_samples_per_class = 10

for label in unique_labels:
    label_indices = np.where(np.array(active_learning_dataset["unlabeled"]["label_name"]) == label)[0]
    # Sample 10 random indices without replacement
    random_indices = np.random.choice(
        label_indices, size=n_samples_per_class, replace=False
    )
    samples.extend(random_indices)

initial_samples = active_learning_dataset["unlabeled"].select(samples)

# Verify the result (should show 100 rows total, 10 per class)
print(f"Total samples: {len(initial_samples)}")
print("\nSamples per class:")
print(initial_samples.select_columns(["label_name"]).to_pandas().value_counts())

Total samples: 20

Samples per class:
label_name
dog           10
food          10
Name: count, dtype: int64


In [9]:
initial_samples

Dataset({
    features: ['image', 'filepath', 'label_name'],
    num_rows: 20
})

In [10]:
initial_samples = initial_samples.remove_columns("image")
df = initial_samples.to_pandas()
df = df.rename(columns={"label_name": "label"})
df

Unnamed: 0,filepath,label
0,data/dog-food/0/00376.jpg,dog
1,data/dog-food/0/00566.jpg,dog
2,data/dog-food/0/00142.jpg,dog
3,data/dog-food/0/00206.jpg,dog
4,data/dog-food/0/00101.jpg,dog
5,data/dog-food/0/00364.jpg,dog
6,data/dog-food/0/00529.jpg,dog
7,data/dog-food/0/00271.jpg,dog
8,data/dog-food/0/00588.jpg,dog
9,data/dog-food/0/00093.jpg,dog


In [11]:
df.to_parquet("data/dog-food/initial_samples.parquet")

## Make Evaluation Split

In [13]:
eval_samples = active_learning_dataset["evaluation"].remove_columns("image")
eval_samples = eval_samples.rename_column("label_name", "label")
eval_samples

Dataset({
    features: ['filepath', 'label'],
    num_rows: 900
})

In [14]:
df = eval_samples.to_pandas()
df

Unnamed: 0,filepath,label
0,data/dog-food/0/00000.jpg,dog
1,data/dog-food/0/00001.jpg,dog
2,data/dog-food/0/00002.jpg,dog
3,data/dog-food/0/00003.jpg,dog
4,data/dog-food/0/00004.jpg,dog
...,...,...
895,data/dog-food/1/00895.jpg,food
896,data/dog-food/1/00896.jpg,food
897,data/dog-food/1/00897.jpg,food
898,data/dog-food/1/00898.jpg,food


In [15]:
df.to_parquet("data/dog-food/evaluation_samples.parquet")

## Make Unlabeled Split

In [16]:
unlabeled_samples = active_learning_dataset["unlabeled"].remove_columns("image")
unlabeled_samples = unlabeled_samples.rename_column("label_name", "label")

df = unlabeled_samples.to_pandas()
df


Unnamed: 0,filepath,label
0,data/dog-food/0/00000.jpg,dog
1,data/dog-food/0/00001.jpg,dog
2,data/dog-food/0/00002.jpg,dog
3,data/dog-food/0/00003.jpg,dog
4,data/dog-food/0/00004.jpg,dog
...,...,...
2095,data/dog-food/1/02095.jpg,food
2096,data/dog-food/1/02096.jpg,food
2097,data/dog-food/1/02097.jpg,food
2098,data/dog-food/1/02098.jpg,food


Remove the initial samples from the unlabeled samples

In [17]:
initial_filepaths = set(initial_samples['filepath'])
initial_filepaths

{'data/dog-food/0/00093.jpg',
 'data/dog-food/0/00101.jpg',
 'data/dog-food/0/00142.jpg',
 'data/dog-food/0/00206.jpg',
 'data/dog-food/0/00271.jpg',
 'data/dog-food/0/00364.jpg',
 'data/dog-food/0/00376.jpg',
 'data/dog-food/0/00529.jpg',
 'data/dog-food/0/00566.jpg',
 'data/dog-food/0/00588.jpg',
 'data/dog-food/1/00922.jpg',
 'data/dog-food/1/00995.jpg',
 'data/dog-food/1/01032.jpg',
 'data/dog-food/1/01047.jpg',
 'data/dog-food/1/01131.jpg',
 'data/dog-food/1/01159.jpg',
 'data/dog-food/1/01208.jpg',
 'data/dog-food/1/01274.jpg',
 'data/dog-food/1/01588.jpg',
 'data/dog-food/1/01983.jpg'}

In [19]:
# Filter out rows that are in initial_samples
unlabeled_samples = unlabeled_samples.filter(
    lambda x: x['filepath'] not in initial_filepaths
)

unlabeled_samples

Filter:   0%|          | 0/2080 [00:00<?, ? examples/s]

Dataset({
    features: ['filepath', 'label'],
    num_rows: 2080
})

In [20]:
df = unlabeled_samples.to_pandas()
df
df.to_parquet("data/dog-food/unlabeled_samples.parquet")