# Shard Demo: Processing and Analyzing Data with DeltaCat

This example showcases how DeltaCat simplifies dataset management, enabling seamless integration with PyTorch and distributed frameworks like Ray for scalable data processing.

**Steps in the Demo:**
- **Generate Data:** Create a Parquet file of fake person records with the columns `id`, `name`, `age`, and `score`.
- **Load Dataset:** Use DeltaCat to load the data from the Parquet file into a `Dataset`.
- **Shard Data:** Split the dataset into multiple shards for parallel processing.
- **Distributed Processing:** Use Ray to process each shard in parallel, calculating the total age and the count for each shard.
- **Aggregate Results:** Combine results from all shards to compute the overall average age.

In [None]:
from typing import Tuple, Generator, List
import deltacat as dc
import pathlib
import pyarrow as pa
import pyarrow.parquet as pq
import torch
import ray
from faker import Faker
import random
import string

# Start Ray for this notebook session. ignore_reinit_error=True keeps the cell
# safe to re-run: a plain ray.init() raises if Ray is already initialized in
# the current kernel.
ray.init(ignore_reinit_error=True)

In [None]:
# Where the generated Parquet file is written: "data.parquet" in the current
# working directory.
parquet_file_path = pathlib.Path.cwd().joinpath("data.parquet")

def generate_random_id(length=10, char_pool=string.digits):
    """Return a random ID of ``length`` characters drawn from ``char_pool``.

    Characters are sampled independently with replacement, so the IDs are not
    guaranteed to be unique, and their distribution over shards may or may
    not turn out uniform. Pass ``char_pool=string.ascii_letters`` to get
    alphabetic string IDs instead of digit strings.
    """
    picked = random.choices(char_pool, k=length)
    return "".join(picked)

fake = Faker()
num_records = 10000


def _column(make_value):
    """Build one column by calling ``make_value`` once per record."""
    return [make_value() for _ in range(num_records)]


# Try swapping in one of the alternative "id" generators below.
# You should observe a difference in how the IDs are distributed over shards.
data = {
    "id": range(1, num_records + 1),
    # "id": _column(lambda: generate_random_id(char_pool=string.ascii_letters)),
    # "id": _column(fake.uuid4),
    "name": _column(fake.name),
    "age": _column(lambda: random.randint(18, 80)),
    "score": _column(lambda: random.randint(0, 100)),
}

# Convert the column dict to an Arrow table and persist it as Parquet.
table = pa.Table.from_pydict(data)
pq.write_table(table, parquet_file_path)
print(f"Created Parquet file at: {parquet_file_path}")

In [None]:
# Load the generated Parquet file into a DeltaCat Dataset named "data".
# NOTE(review): metadata_uri="." presumably stores DeltaCat's metadata in the
# current working directory, and merge_keys="id" presumably marks "id" as the
# record key -- confirm both against the DeltaCat documentation.
dataset = dc.Dataset.from_parquet(
    name="data",
    file_uri=parquet_file_path,
    metadata_uri=".",
    merge_keys="id"
)
print("Loaded dataset from Parquet file.")
# Print records from the dataset (limited via num_records=10) as a sanity check.
dataset.print(num_records=10)

In [None]:
# Ask for 10 shards, then print how many came back followed by the shards
# themselves (one value per line).
shards = dataset.shards(num_shards=10)
print(len(shards), shards, sep="\n")

In [None]:
@ray.remote
def process_shard(shard: Generator[torch.Tensor, None, None], fields: List[str]) -> Tuple[float, int]:
    """Sum column 0 of one shard's tensors and count the values summed.

    Runs as a Ray task. Scans the module-level ``dataset`` restricted to
    ``shard`` and ``fields``, converts the scan to tensors, and accumulates
    over column 0 of every tensor yielded.

    NOTE(review): ``shard`` is forwarded unchanged to
    ``dataset.scan(shard=...)``, so it is presumably a DeltaCat shard object
    rather than a generator of tensors as annotated -- confirm the correct
    type against DeltaCat.

    Args:
        shard: The shard to scan (one element of ``dataset.shards(...)``).
        fields: Names of the fields to read. Only column 0 of each tensor is
            aggregated; presumably that is the first field listed -- confirm.

    Returns:
        A ``(total, count)`` tuple: the sum of column 0 across all tensors
        and the number of values that went into that sum.
    """
    tensor_generator = dataset.scan(shard=shard, fields=fields).to_tensor()
    total_age = 0.0
    count = 0

    for tensor in tensor_generator:
        ages = tensor[:, 0]
        total_age += ages.sum().item()
        # Count rows, not tensors: a 2-D tensor may hold several records, and
        # the average is later computed as total_age / count.
        count += ages.numel()

    return total_age, count

# You can pick which fields to include here.
# Fields must be numeric because they are converted to tensors.
# Note that process_shard only reads column 0 of each tensor.
fields_to_include = ["age"]
# Launch one Ray task per shard; each .remote() call returns a future
# without waiting for the task to finish.
futures = [process_shard.remote(shard, fields=fields_to_include) for shard in shards]
# Block until all tasks complete; results holds one (total_age, count) tuple
# per shard, in the same order as `futures`.
results = ray.get(futures)


In [None]:
# Fold the per-shard (total age, count) pairs into dataset-wide totals,
# reporting each shard's contribution along the way.
total_age = 0.0
total_count = 0

for idx, (shard_total_age, shard_count) in enumerate(results):
    print(f"[Shard {idx}] total count: {shard_count}, total age: {shard_total_age}")
    total_age += shard_total_age
    total_count += shard_count

# Guard the division: with nothing counted there is no average to report.
if total_count <= 0:
    print("No data available to compute average age.")
else:
    average_age = total_age / total_count
    print(f"Average age across all shards: {average_age}")