In [1]:
import numpy as np
from numpy.typing import NDArray
import atdata
from atdata.local import LocalDatasetEntry, S3DataStore, Index
import webdataset as wds

In [2]:
# NOTE: the docstring below is left verbatim on purpose — it shows up as the
# `description` of the published schema record.
@atdata.packable
class TrainingSample:
    """A sample containing features and label for training."""
    # Feature array for the sample (recorded as an ndarray field in the published schema)
    features: NDArray
    # Integer label for the sample
    label: int

from dataclasses import dataclass

# NOTE(review): this class both applies `@atdata.packable` and subclasses
# `atdata.PackableSample`, whereas TrainingSample only uses the decorator —
# presumably the explicit base is there for type-checker support; TODO confirm.
# The docstring is left verbatim: it shows up as the published schema `description`.
@atdata.packable
class TextSample( atdata.PackableSample ):
    """A sample containing text data."""
    # Text content of the sample
    text: str
    # Category of the sample, stored as a string
    category: str

In [3]:
# Build one TextSample by keyword to exercise the packable class
x = TextSample(
    text = 'Hello',
    category = 'test',
)

In [4]:
# Display the packed (serialized bytes) form of the sample
x.packed

b'\x82\xa4text\xa5Hello\xa8category\xa4test'

---

In [5]:
# Connect to S3
# The first argument looks like the path to a credentials env file (TODO confirm);
# `bucket` names the bucket the store targets.
store = S3DataStore( '.credentials/r2-analysis-hive.env',
    bucket = "analysis-hive"
)
print(f"Bucket: {store.bucket}")
print(f"Supports streaming: {store.supports_streaming()}")

# Connect to Redis
# NOTE(review): no Redis settings are passed here, so `Index` presumably falls
# back to a default connection — TODO confirm. The index is given the S3 store
# as its data store and has automatic stub generation switched on.
index = Index(
    data_store = store,
    auto_stubs = True,
)
print( "LocalIndex connected" )

Bucket: analysis-hive
Supports streaming: True
LocalIndex connected


In [6]:
# Materialize every schema record currently known to the index for inspection
list( index.list_schemas() )

[{'name': 'TrainingSample',
  'version': '1.0.0',
  'fields': [{'name': 'features',
    'fieldType': {'$type': 'local#ndarray', 'dtype': 'float32'},
    'optional': False},
   {'name': 'label',
    'fieldType': {'$type': 'local#primitive', 'primitive': 'int'},
    'optional': False}],
  '$ref': 'atdata://local/sampleSchema/TrainingSample@1.0.0',
  'description': 'A sample containing features and label for training.',
  'createdAt': '2026-01-28T21:47:19.019250+00:00'},
 {'name': 'TextSample',
  'version': '1.0.1',
  'fields': [{'name': 'text',
    'fieldType': {'$type': 'local#primitive', 'primitive': 'str'},
    'optional': False},
   {'name': 'category',
    'fieldType': {'$type': 'local#primitive', 'primitive': 'str'},
    'optional': False}],
  '$ref': 'atdata://local/sampleSchema/TextSample@1.0.1',
  'description': 'A sample containing text data.',
  'createdAt': '2026-01-28T19:02:34.955392+00:00'},
 {'name': 'TextSample',
  'version': '1.1',
  'fields': [{'name': 'text',
    'fiel

In [7]:
# Take the first item yielded by `index.schemas`
s = next( index.schemas )

In [8]:
# Show the reference URI of that schema
s.ref

'atdata://local/sampleSchema/TrainingSample@1.0.0'

In [9]:
# Register TrainingSample in the index under an explicit version
schema_ref = index.publish_schema( TrainingSample, version = "1.0.0" )
print( f"Published schema: {schema_ref}" )

# Walk over every schema record in the index, one "name vVersion" line each
for schema in index.list_schemas():
    print(
        f"  - {schema.get('name', 'Unknown')} v{schema.get('version', '?')}"
    )

# Fetch the stored record for the schema we just published and list its field names
schema_record = index.get_schema( schema_ref )
field_names = [ field['name'] for field in schema_record.get( 'fields', [] ) ]
print( f"Schema fields: {field_names}" )

# Rebuild a sample class from the stored schema
decoded_type = index.decode_schema( schema_ref )
print( f"Decoded type: {decoded_type.__name__}" )

Published schema: atdata://local/sampleSchema/TrainingSample@1.0.0
  - TrainingSample v1.0.0
  - TextSample v1.0.1
  - TextSample v1.1
  - TrainingSample v1.1
Schema fields: ['features', 'label']
Decoded type: TrainingSample


In [10]:
# Register TextSample in the index under an explicit version
schema_ref_2 = index.publish_schema( TextSample, version = "1.0.1" )
print( f"Published schema: {schema_ref_2}" )

# Walk over every schema record in the index, one "name vVersion" line each
for schema in index.list_schemas():
    print(
        f"  - {schema.get('name', 'Unknown')} v{schema.get('version', '?')}"
    )

# Fetch the stored record for the schema we just published and list its field names
schema_record = index.get_schema( schema_ref_2 )
field_names = [ field['name'] for field in schema_record.get( 'fields', [] ) ]
print( f"Schema fields: {field_names}" )

# Rebuild a sample class from the stored schema
decoded_type = index.decode_schema( schema_ref_2 )
print( f"Decoded type: {decoded_type.__name__}" )

Published schema: atdata://local/sampleSchema/TextSample@1.0.1
  - TrainingSample v1.0.0
  - TextSample v1.0.1
  - TextSample v1.1
  - TrainingSample v1.1
Schema fields: ['text', 'category']
Decoded type: TextSample


In [11]:
from typing import TypeVar, TypeAlias, Generic, Callable, Any

# Type variables for the two ends of a lens, both bounded to packable samples:
# S is the source ("remote") sample type, V is the view ("local") sample type.
S = TypeVar( 'S', bound = atdata.PackableSample )
V = TypeVar( 'V', bound = atdata.PackableSample )

# A converter that accepts any object and returns a sample of type V
FromAnyTo = Callable[[Any], V]

def make_local_lens( f: FromAnyTo[V], remote: type[S], local: type[V] ) -> atdata.Lens[S, V]:
    """Build a lens that converts samples of ``remote`` into samples of ``local``.

    Args:
        f: Conversion function; receives one source sample and returns the
            corresponding ``local`` sample.
        remote: Source sample class (for example a type loaded from the index).
        local: Target sample class that ``f`` produces.

    Returns:
        The object produced by ``atdata.lens`` for the wrapped conversion.
    """
    def _to_local( s ):
        return f( s )

    # Annotate the wrapper with the concrete classes. Previously it was
    # annotated with the bare TypeVars S / V, which left `remote` and `local`
    # unused and gave the decorator no concrete types to work with.
    # NOTE(review): assumes `atdata.lens` reads the wrapped function's
    # annotations to learn the source/view types — TODO confirm.
    _to_local.__annotations__ = { 's': remote, 'return': local }

    return atdata.lens( _to_local )

In [12]:
# Load the published TextSample@1.1 schema, then pick up the corresponding
# class from `index.types` (the call also reports generating a stub module).
index.load_schema( 'atdata://local/sampleSchema/TextSample@1.1' )
TextSampleRemote = index.types.TextSample


[atdata] Generated schema module in: /Users/max/.atdata/stubs
[atdata] For IDE support, add this path to your type checker:
[atdata]   VS Code/Pylance: Add to python.analysis.extraPaths
[atdata]   PyCharm: Mark as Sources Root
[atdata]   mypy: Add to mypy_path in mypy.ini



In [13]:
# Build a sample of the index-loaded type.
# NOTE: this rebinds `x`, which previously held a local TextSample.
x = TextSampleRemote(
    text = 'hello',
    category = 'test',
)

In [14]:
def _to_text_sample( s: Any ) -> TextSample:
    """Copy the `text` and `category` attributes of `s` into a local TextSample."""
    copied = { 'text': s.text, 'category': s.category }
    return TextSample( **copied )

# Lens from the index-loaded TextSample type to the locally defined TextSample
l = make_local_lens( _to_text_sample, TextSampleRemote, TextSample )

In [15]:
# Apply the lens to the remote-typed sample to get its local view
y = l( x )

---

In [17]:
import webdataset as wds
from uuid import uuid4

from pathlib import Path

# printf-style shard pattern; %06d is filled with the shard number
data_pattern = 'data/TextSample_test-%06d.tar'

# Make sure the output directory exists so the cell also works on a fresh
# checkout (the shard files are opened for writing inside this directory).
Path( data_pattern ).parent.mkdir( parents = True, exist_ok = True )

# Write 10,000 samples with random UUID text, at most 1,000 samples per shard
with wds.writer.ShardWriter( data_pattern, maxcount = 1_000 ) as sink:
    for i in range( 10_000 ):
        new_sample = TextSample(
            text = str( uuid4() ),
            category = 'test',
        )
        sink.write( new_sample.as_wds )

# writing data/TextSample_test-000000.tar 0 0.0 GB 0
# writing data/TextSample_test-000001.tar 1000 0.0 GB 1000
# writing data/TextSample_test-000002.tar 1000 0.0 GB 2000
# writing data/TextSample_test-000003.tar 1000 0.0 GB 3000
# writing data/TextSample_test-000004.tar 1000 0.0 GB 4000
# writing data/TextSample_test-000005.tar 1000 0.0 GB 5000
# writing data/TextSample_test-000006.tar 1000 0.0 GB 6000
# writing data/TextSample_test-000007.tar 1000 0.0 GB 7000
# writing data/TextSample_test-000008.tar 1000 0.0 GB 8000
# writing data/TextSample_test-000009.tar 1000 0.0 GB 9000


In [18]:
from atdata import load_dataset

# Load the ten local shards via brace expansion (000000..000009) and
# view the dataset as TextSample
ds = (
    load_dataset( 'data/TextSample_test-{000000..000009}.tar',
        split = 'test'
    )
    .as_type( TextSample )
)

In [19]:
# Pull the first sample from the ordered iteration
x = next( iter( ds.ordered() ) )

In [20]:
# Inspect the sample that was just read back
x

TextSample(text='7298401e-7183-4bf6-8f9f-feeab0145d88', category='test')

In [21]:
# Insert the dataset into the index under a name and storage prefix,
# recording it against the TextSample@1.1 schema reference
entry = index.insert_dataset( ds, 
    name = 'proto-text-samples-10',
    prefix = 'prototyping',
    schema_ref = 'atdata://local/sampleSchema/TextSample@1.1',
)

# writing analysis-hive/prototyping/data--97e6a2ab-f3ee-4f31-967f-2c0b1bc8f1e1--000000.tar 0 0.0 GB 0


In [22]:
# Inspect the index entry returned by the insert
entry

LocalDatasetEntry(name='proto-text-samples-10', schema_ref='atdata://local/sampleSchema/TextSample@1.1', data_urls=['s3://analysis-hive/prototyping/data--97e6a2ab-f3ee-4f31-967f-2c0b1bc8f1e1--000000.tar'], metadata=None)

Notes:

* We should make sure that the `s3` URI scheme in `data_urls` is used properly.
    * Should we store an `https` URI instead, since the data is actually streamed with `wds`? Or does this indicate that we should think more deeply about the `Dataset` API design and generalize how we set up `wds` data streaming?
    * No matter what, we need to incorporate the actual host details of the `LocalIndex`'s `S3DataStore` here, since the S3 host is definitely not local.
    * Should there be underscores here? These feel like public properties ...

---

In [26]:
from atdata import load_dataset

# Load from local index
# The index resolves the dataset name to URLs and schema
ds = load_dataset( "@local/proto-text-samples-10", TextSample,
    split = 'train',
    index = index,
)

# Pull a single shuffled batch. Unlike `for ...: break`, this raises instead of
# leaving `batch` undefined (or stale from an earlier run) when nothing is yielded.
batch = next( iter( ds.shuffled( batch_size = 32 ) ) )

Notes:

* This call also triggers a linting error on `load_dataset`: no matching overloads.

In [32]:
# Show the `text` values collected across the batch
batch.text

['490364ef-cbb3-4ae6-9d1e-e77b23d93179',
 '9d8eada3-c8be-4bf2-bf30-b05cb732881e',
 '12d7f7c4-142b-45a1-8094-2358ec9e95d0',
 'fc8fb9a3-4aa0-4446-a90e-7575840cf28b',
 '7cef4d2b-2775-4118-b420-c57f098eb6fc',
 '1b98a97b-c6bf-40a9-b2a8-e94f9384b632',
 '028f4549-096e-46c1-8d01-286086d2edc5',
 'ba3858be-da4b-47c1-be68-644edf0bf354',
 '69e9fc0f-7047-4aca-aac5-d5cecbfc6d44',
 '8147fd45-b1cd-4945-b021-e97666f54dbd',
 'a1ba76bc-71fb-49af-87e2-ff62b850224b',
 '84e58647-fd44-4fd7-9feb-dc6bfde2aef3',
 'f923b9bc-b999-4ae2-b21d-b407d1fc7e32',
 '29c3055b-73fb-48b9-a662-26ea1f697f0c',
 '4e624f21-785f-4f87-87c4-371598a93e79',
 'a285cbaa-7a5c-4518-a5a6-f4a1f70b4fa8',
 'd9a8973b-8430-420b-9b93-0ddde1e6fb1f',
 '59eb6822-572c-4597-b929-a7dadb06770a',
 'd300982a-0d4e-41e4-a849-b27fb93e5333',
 '53f0edee-e060-49fb-bcc9-4fa29bf75747',
 '3a301280-3fdf-49a3-aad1-8285b9182e74',
 'a9d8afac-84b8-4ef0-992c-7611980dda9a',
 '1d06edc6-1276-4460-aed8-d6974878fd1f',
 '45642c2e-d9c2-499e-9d2e-a89b6b77dc42',
 'f035dddd-7c4e-

In [36]:
# Show the URL the index-loaded dataset points at
ds.url

's3://analysis-hive/prototyping/data--4a5ff662-803b-4700-81f4-45f288f6e565--000000.tar'

Notes:

* We're getting linting errors because `AbstractIndex` is used as a protocol; would it be better to subclass it, or is there a way to make the type checker recognize the protocol adherence?
* The S3 URI problem now surfaces here because of how dataset loading works! The data is uploaded correctly on my end, but it can't be accessed, because an `s3://` URI is not the right way to reach the data when `wds` streams over `https`; we should think about how best to encode this!