In [None]:
import os
from datasets import load_dataset
from PIL import Image
from tqdm import tqdm
from classes import IMAGENET2012_CLASSES

In [None]:
# Export the first N training examples of ILSVRC/imagenet-1k (Hugging Face)
# to <OUTPUT_DIR>/train/<synset>/<synset>_<index>.JPEG
OUTPUT_DIR = "imagenet_subset"
N = 1000

# Map class name to synset ID (inverse of IMAGENET2012_CLASSES, which is
# iterated below as synset -> name)
class_name_to_synset = {
    name: synset
    for synset, name in IMAGENET2012_CLASSES.items()
}

# Create output directory with all classes, so every class has a folder even
# if none of the first N examples belongs to it
os.makedirs(f"{OUTPUT_DIR}/train", exist_ok=True)
for synset in IMAGENET2012_CLASSES.keys():
    os.makedirs(f"{OUTPUT_DIR}/train/{synset}", exist_ok=True)

# Load HF dataset in streaming mode
ds = load_dataset(
    "ILSVRC/imagenet-1k",
    split="train",
    streaming=True
)

# Loop-invariant: the label feature used to turn integer labels into names
label_feature = ds.features["label"]

# Images written so far in this run, per synset. Numbering files from this
# counter (instead of counting the files already on disk) avoids one directory
# listing per image and makes a re-run overwrite the files of the previous run
# rather than append duplicate copies under new indices (this relies on the
# stream yielding examples in the same order on every run).
images_per_synset = {}

count = 0
for example in tqdm(ds, total=N):
    img = example["image"]
    label = example["label"]
    class_name = label_feature.int2str(label)

    # Skipped examples are reported but do not count towards N
    if class_name not in class_name_to_synset:
        print(f"WARNING: '{class_name}' not found", count)
        continue

    synset = class_name_to_synset[class_name]
    class_dir = f"{OUTPUT_DIR}/train/{synset}"
    os.makedirs(class_dir, exist_ok=True)

    # DINOv2 expected format: <synset>_<index>.JPEG (index is 1-based per class)
    index = images_per_synset.get(synset, 0) + 1
    images_per_synset[synset] = index
    filename = f"{class_dir}/{synset}_{index}.JPEG"
    img.convert("RGB").save(filename, format="JPEG")

    count += 1
    if count >= N:
        break

In [None]:
# Export the first N validation examples of ILSVRC/imagenet-1k (Hugging Face)
# to <OUTPUT_DIR>/val/<synset>/ILSVRC2012_val_<index>.JPEG
OUTPUT_DIR = "imagenet_subset"
N = 5000

# Map class name to synset ID (inverse of IMAGENET2012_CLASSES, which is
# iterated below as synset -> name)
class_name_to_synset = {
    name: synset
    for synset, name in IMAGENET2012_CLASSES.items()
}

# Create output directory with all classes, so every class has a folder even
# if none of the first N examples belongs to it
os.makedirs(f"{OUTPUT_DIR}/val", exist_ok=True)
for synset in IMAGENET2012_CLASSES.keys():
    os.makedirs(f"{OUTPUT_DIR}/val/{synset}", exist_ok=True)

# Load HF dataset in streaming mode
ds_val = load_dataset(
    "ILSVRC/imagenet-1k",
    split="validation",
    streaming=True
)

# Loop-invariant: the label feature used to turn integer labels into names
label_feature = ds_val.features["label"]

# Images written so far in this run, per synset. Numbering files from this
# counter (instead of counting the files already on disk) avoids one directory
# listing per image and makes a re-run overwrite the files of the previous run
# rather than append duplicate copies under new indices (this relies on the
# stream yielding examples in the same order on every run).
images_per_synset = {}

count = 0
for example in tqdm(ds_val, total=N):
    img = example["image"]
    label = example["label"]
    class_name = label_feature.int2str(label)

    # Skipped examples are reported but do not count towards N
    if class_name not in class_name_to_synset:
        print(f"WARNING: '{class_name}' not found", count)
        continue

    synset = class_name_to_synset[class_name]
    class_dir = f"{OUTPUT_DIR}/val/{synset}"
    os.makedirs(class_dir, exist_ok=True)

    # DINOv2 expected format: ILSVRC2012_val_<index>.JPEG
    # (index is 1-based per class and zero-padded to 8 digits)
    index = images_per_synset.get(synset, 0) + 1
    images_per_synset[synset] = index
    filename = f"{class_dir}/ILSVRC2012_val_{index:08d}.JPEG"
    img.convert("RGB").save(filename, format="JPEG")

    count += 1
    if count >= N:
        break

In [None]:
# Add one validation example for each missing class (required by DINOv2 preprocessing)
OUTPUT_DIR = "imagenet_subset"

# Map class name to synset ID (inverse of IMAGENET2012_CLASSES, which is
# iterated below as synset -> name)
class_name_to_synset = {
    name: synset
    for synset, name in IMAGENET2012_CLASSES.items()
}

# Identify missing val classes: those whose folder is absent or empty
missing = []
for synset in IMAGENET2012_CLASSES.keys():
    class_dir = f"{OUTPUT_DIR}/val/{synset}"
    if not os.path.isdir(class_dir) or len(os.listdir(class_dir)) == 0:
        missing.append(synset)

print("Missing val classes:", missing)

# Only stream the validation split when there is something to fill in.
# Without this guard an empty `missing` list (e.g. when the cell is re-run)
# would never hit the `break` below and would iterate over the entire split.
if missing:
    # Load HF dataset in streaming mode
    ds_val = load_dataset(
        "ILSVRC/imagenet-1k",
        split="validation",
        streaming=True
    )
    label_feature = ds_val.features["label"]

    for example in ds_val:
        class_name = label_feature.int2str(example["label"])

        if class_name not in class_name_to_synset:
            continue
        synset = class_name_to_synset[class_name]
        if synset not in missing:
            continue

        class_dir = f"{OUTPUT_DIR}/val/{synset}"
        os.makedirs(class_dir, exist_ok=True)

        # DINOv2 expected format: ILSVRC2012_val_<index>.JPEG
        # The folder was found absent or empty above, so this evaluates to 1
        # unless something else wrote into it in the meantime.
        index = len(os.listdir(class_dir)) + 1
        filename = f"{class_dir}/ILSVRC2012_val_{index:08d}.JPEG"

        example["image"].convert("RGB").save(filename, format="JPEG")
        print("Added one image to", filename)

        # One image per class is enough; stop as soon as every class is covered
        missing.remove(synset)
        if not missing:
            break
    else:
        # The stream ended before every missing class received an image
        print("WARNING: no validation image found for classes:", missing)


Missing val classes: ['n01751748', 'n02098286', 'n02123159', 'n04086273', 'n07565083']
Added one image to imagenet_subset/val/n04086273/ILSVRC2012_val_00000001.JPEG
Added one image to imagenet_subset/val/n01751748/ILSVRC2012_val_00000001.JPEG
Added one image to imagenet_subset/val/n02123159/ILSVRC2012_val_00000001.JPEG
Added one image to imagenet_subset/val/n07565083/ILSVRC2012_val_00000001.JPEG
Added one image to imagenet_subset/val/n02098286/ILSVRC2012_val_00000001.JPEG


In [None]:
# Export the first N test examples of ILSVRC/imagenet-1k (Hugging Face)
# into a single flat folder, <OUTPUT_DIR>/test
OUTPUT_DIR = "imagenet_subset"
N = 5000

# The test split is written without class subfolders
test_dir = f"{OUTPUT_DIR}/test"
os.makedirs(test_dir, exist_ok=True)

# Stream the split from the Hugging Face Hub
ds_test = load_dataset("ILSVRC/imagenet-1k", split="test", streaming=True)

count = 0  # number of images written so far
for position, example in enumerate(tqdm(ds_test, total=N), start=1):
    # DINOv2 expected format: ILSVRC2012_test_<index>.JPEG
    # (index is 1-based and zero-padded to 8 digits)
    filename = f"{test_dir}/ILSVRC2012_test_{position:08d}.JPEG"
    rgb_image = example["image"].convert("RGB")
    rgb_image.save(filename, format="JPEG")

    # Record the write only after it succeeded, then stop once N are saved
    count = position
    if count >= N:
        break