# CIFAR-10 Dataset Handling with Atria

## Setup and Auto-reloading Modules
We enable auto-reloading of modules so that any changes in imported libraries are automatically reflected.

In [1]:
# Load IPython's autoreload extension and select mode 2, so that edits to
# imported modules are picked up automatically without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Importing Dependencies
Here, we modify the system path to include the project's root directory and import necessary modules for dataset handling.

## Loading the CIFAR-10 Dataset
We load the CIFAR-10 dataset from the registry with `AtriaImageDataset.load_from_registry`, limiting each split to 1000 samples, and display the training split as a dataframe.

In [3]:
from atria_core.utilities.imports import _get_package_base_path

from atria_datasets import AtriaImageDataset, FileStorageType

# Base path of the `atria` package, as reported by the helper.
package_path = _get_package_base_path("atria")

# Sample limits handed to the dataset build (1000 for each of the three keys).
sample_limits = {
    "max_train_samples": 1000,
    "max_test_samples": 1000,
    "max_validation_samples": 1000,
}

# Load CIFAR-10 from the registry entry provided by `atria_datasets`.
dataset = AtriaImageDataset.load_from_registry(
    name="cifar10", provider="atria_datasets", build_kwargs=sample_limits
)

# Last expression of the cell: render the training split as a dataframe.
dataset.train.dataframe()


[2025-07-11 12:21:30][atria_datasets.core.dataset.atria_dataset][INFO] Loading dataset cifar10 from registry.


[2025-07-11 12:21:30][atria_datasets.core.dataset.atria_dataset][INFO] Caching dataset to storage dir: /mnt/hephaistos/.atria/datasets/cifar10/main
[2025-07-11 12:21:30][atria_datasets.core.dataset.atria_dataset][INFO] Loading dataset split train from cached storage: /mnt/hephaistos/.atria/datasets/cifar10/main/delta/train
[2025-07-11 12:21:30][atria_datasets.core.dataset.atria_dataset][INFO] Loading dataset split test from cached storage: /mnt/hephaistos/.atria/datasets/cifar10/main/delta/test


Unnamed: 0,index,sample_id,image_file_path,image_content,image_width,image_height,gt_classification,gt_ser,gt_ocr,gt_qa,gt_vqa,gt_layout
0,0,3576b4e0-8e49-4cba-9706-25961865ad78,,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,32,32,"{""label"": {""value"": 6, ""name"": ""frog""}}",,,,,
1,1,994c31a7-e664-4772-af41-f69f890dfeb4,,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,32,32,"{""label"": {""value"": 9, ""name"": ""truck""}}",,,,,
2,2,a5c7c216-4f9a-4df7-9ae2-d4c13bd2f186,,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,32,32,"{""label"": {""value"": 9, ""name"": ""truck""}}",,,,,
3,3,a542f5ca-98cb-4000-b125-a2721cecb714,,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,32,32,"{""label"": {""value"": 4, ""name"": ""deer""}}",,,,,
4,4,bdaa5a1e-4e4e-4a7a-b4b8-4f0051178244,,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,32,32,"{""label"": {""value"": 1, ""name"": ""automobile""}}",,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,dbe47cdd-e126-4b50-b608-015460cdc310,,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,32,32,"{""label"": {""value"": 3, ""name"": ""cat""}}",,,,,
996,996,cfeacf75-1c46-4a0b-ac2b-2c8d5740a7ab,,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,32,32,"{""label"": {""value"": 5, ""name"": ""dog""}}",,,,,
997,997,8ce04f7c-8195-40e8-b2c7-38d6e8c20f25,,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,32,32,"{""label"": {""value"": 1, ""name"": ""automobile""}}",,,,,
998,998,78a83987-f2d6-4fd0-ac00-53628da5c239,,b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\...,32,32,"{""label"": {""value"": 3, ""name"": ""cat""}}",,,,,


## Creating batched instances from a list of samples
We create a list of sample instances and then call `batched`, a class method of the instance type, on that list to combine the samples into a single batch.

In [6]:
# Convert the first two training samples to their tensor form.
instances = [dataset.train[sample_idx].to_tensor() for sample_idx in (0, 1)]

# Combine the samples into one batch; `batched` is invoked through the first instance.
batched = instances[0].batched(instances)

# Print, for each batch entry, its image content minus the content of the same
# sample loaded again from the dataset (all zeros means the batch kept the data intact).
for position in (0, 1):
    print(
        batched.image.content[position]
        - dataset.train[position].to_tensor().image.content
    )



tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]])
tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 

## Dataset handling with File Storage
Load the dataset with a file storage manager that first caches the data to disk.

In [None]:
from atria.data.storage.file_storage_manager import FileStorageManager
from atria.data.storage.utilities import FileStorageType

# Create a file storage manager rooted at /tmp that uses the MSGPACK storage
# type, with streaming mode off and at most 100 samples saved.
# NOTE(review): the recorded run of this cell fails with
# "TypeError: FileStorageManager.__init__() got an unexpected keyword argument
# 'storage_dir'" -- the constructor keywords presumably changed; confirm the
# current signature before relying on this cell.
file_storage_manager = FileStorageManager(
    storage_dir="/tmp",
    streaming_mode=False,
    storage_type=FileStorageType.MSGPACK,
    max_samples=100,
)

TypeError: FileStorageManager.__init__() got an unexpected keyword argument 'storage_dir'

In [None]:
from atria_core.types import DatasetSplitType
from atria_examples.datasets.cifar10 import Cifar10

# Load the training split of CIFAR-10, passing in the file storage manager
# created in the previous cell.
# NOTE(review): the recorded output of this cell is "AttributeError: type object
# 'Cifar10' has no attribute 'load'" -- confirm the current loading entry point
# before relying on this cell.
cifar10 = Cifar10.load(
    split=DatasetSplitType.train,
    storage_manager=file_storage_manager,
)

AttributeError: type object 'Cifar10' has no attribute 'load'

In [None]:
# Fetch the sample at index 0 of the loaded dataset; as the last expression of
# the cell, it is rendered as the cell output.
cifar10[0]

ImageInstance(
    index=0,
    id=UUID('629095eb-cad3-4e9d-8b27-6e989ab27bba'),
    image=Image(
        file_path=None,
        content=<PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x7760B0786900>,
        source_size=None,
        shape=(3, 32, 32),
        dtype=None
    ),
    label=Label(value=6, name='frog')
)