# PyTorch: Combining Multiple Shards of Data

The `AISMultiShardStream` class facilitates combining multiple streams of data shards into one iterable dataset. It takes a list of `DataShard` objects as input, each representing a shard stream. When iterated over, it yields combined samples, where each sample is a tuple containing object bytes from each shard stream. This is particularly useful when related pieces of data (for example, text samples and their class labels) are stored in separate shards.


## Set up the client and the necessary bucket

In [None]:
import os
import io
import shutil
from pathlib import Path
import tarfile
from aistore.client import Client
from aistore.sdk.dataset.data_shard import DataShard
from aistore.pytorch import AISMultiShardStream

In [None]:
# Endpoint of the AIStore cluster: taken from the AIS_ENDPOINT environment
# variable, with a local default when the variable is not set
ais_url = os.environ.get("AIS_ENDPOINT", "http://localhost:8080")
client = Client(ais_url)

# Bucket that will hold the example shards; exist_ok=True is passed so that
# re-running this cell with the bucket already present is not treated as an error
bucket = client.bucket("my-bck").create(exist_ok=True)

## Creating Shards and adding them to our Bucket

In [None]:
# Utility function to create a tar archive from a dictionary of file names and contents
def create_archive(archive_name, content_dict):
    """Write an uncompressed tar archive built from in-memory file contents.

    Any missing parent directories of ``archive_name`` are created first.

    Args:
        archive_name: Destination path of the archive (``str`` or path-like).
        content_dict: Mapping of member file name to its ``bytes`` content.
    """
    # dirname() is "" for a bare file name; os.makedirs("") would raise, so
    # only create the directory when there actually is one. exist_ok=True
    # avoids the check-then-create race of a separate os.path.exists() test.
    directory = os.path.dirname(archive_name)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with tarfile.open(archive_name, "w") as tar:
        for file_name, file_content in content_dict.items():
            # The tar header must carry the member size before the data is added
            info = tarfile.TarInfo(name=file_name)
            info.size = len(file_content)
            tar.addfile(tarinfo=info, fileobj=io.BytesIO(file_content))

In [None]:
# Local staging directory for the shard archives, placed under the current
# working directory
base_path = Path.cwd() / "multishard_example"

Now we will prepare two shards, each containing different types of files: one for ***text data (text_shard.tar)*** and the other for ***class labels (class_shard.tar)***. Each shard is a tar archive containing multiple files. These shards will be combined later using `AISMultiShardStream`, enabling us to process both text and class data simultaneously as a single stream.

In [None]:
# Object name of the text shard in the bucket, and where it is staged locally
text_shard_archive_name = "text_shard.tar"
text_shard_archive_path = base_path / text_shard_archive_name

# Members of the text shard: file name -> raw bytes
text_shard_content_dict = {
    "file1.txt": b"Content of file one",
    "file2.txt": b"Content of file two",
    "file3.txt": b"Content of file three",
    "file4.txt": b"Content of file four",
    "file5.txt": b"Content of file five",
}

# Build the tar on local disk, then upload it to the bucket under the same name
create_archive(text_shard_archive_path, text_shard_content_dict)
text_shard_obj = bucket.object(obj_name=text_shard_archive_name)
text_shard_obj.put_file(text_shard_archive_path)

# DataShard describing the text shard that was just uploaded
shard1 = DataShard(
    client_url=ais_url,
    bucket_name="my-bck",
    prefix=text_shard_archive_name,
)

In [None]:
# Object name of the class shard in the bucket, and where it is staged locally
class_shard_archive_name = "class_shard.tar"
class_shard_archive_path = base_path / class_shard_archive_name

# Members of the class shard: file name -> raw bytes of the class label
class_shard_content_dict = {
    "file1.cls": b"1",
    "file2.cls": b"2",
    "file3.cls": b"3",
    "file4.cls": b"4",
    "file5.cls": b"5",
}

# Build the tar on local disk, then upload it to the bucket under the same name
create_archive(class_shard_archive_path, class_shard_content_dict)
class_shard_obj = bucket.object(obj_name=class_shard_archive_name)
class_shard_obj.put_file(class_shard_archive_path)

# DataShard describing the class shard that was just uploaded
shard2 = DataShard(
    client_url=ais_url,
    bucket_name="my-bck",
    prefix=class_shard_archive_name,
)

## Retrieving both shards in a single Stream

In [None]:
# Combine the two shard streams into one iterable dataset
shard_streams = [shard1, shard2]
dataset = AISMultiShardStream(data_sources=shard_streams)

# Each item is unpacked into two values, in the same order as the sources
# above: text content first, class content second
for text_content, class_content in dataset:
    print(f"Text: {text_content}, Class: {class_content}")

## Cleanup

In [None]:
# Delete the local staging directory that holds the shard archives
local_shard_dir = str(base_path)
try:
    shutil.rmtree(local_shard_dir)
except FileNotFoundError:
    # The directory is already gone, so there is nothing left to clean up
    pass

In [None]:
# Remove the example bucket from the cluster. missing_ok=True keeps this
# cleanup cell safe to re-run when the bucket has already been deleted.
bucket.delete(missing_ok=True)