In [1]:
import ray
import time
import hashlib
from pathlib import Path
from ray.util.queue import Queue

# Tear down any Ray session still attached to this kernel before starting a
# new one; the short pause presumably gives the old session time to exit.
ray.shutdown()
time.sleep(1)

PROJECT_ROOT = "/Users/rajsingh/Desktop/code/music_audio_analyzer"

# Patterns left out of the working-directory upload (audio assets, VCS data,
# bytecode caches).
_upload_excludes = ["*.mp3", "*.wav", "audio_files/", ".git/", "__pycache__/"]

# A value that differs on every run — presumably so Ray does not reuse a
# previously cached runtime environment; TODO confirm.
_worker_env_vars = {"_CACHE_BUST": str(time.time())}

ray.init(
    runtime_env=dict(
        working_dir=PROJECT_ROOT,
        excludes=_upload_excludes,
        env_vars=_worker_env_vars,
    )
)

from src.streaming_pipeline import (
    FunctionAgent,
    AgentRayComputeConfig,
    AgentStage,
    QueueStreamingDatasource,
    StreamingDatasourceConfig,
    StreamingPipeline,
)


# Test function - receives audio_bytes directly, NOT an ObjectRef
def test_audio_direct(items):
    """Summarise each job item by the size of its inline audio payload.

    Args:
        items: Iterable of dict-like job rows. Each row is read for the keys
            ``job_id``, ``filename`` and ``audio_bytes``; the payload is
            expected to be the raw bytes themselves, not a Ray ``ObjectRef``.

    Returns:
        A list with one dict per input item, in input order, with the keys
        ``job_id``, ``filename``, ``audio_size`` (payload length, or ``0``
        when the payload is missing or empty) and ``error`` (``None`` on
        success, ``"No audio_bytes"`` otherwise). A row without ``job_id``
        reports ``None``; a row without ``filename`` reports ``"unknown"``.
    """
    # Imported inside the function rather than at module level — presumably
    # so the import resolves in whichever process executes this callable.
    from loguru import logger

    results = []
    for item in items:
        # Read the identifying fields tolerantly and only once, so a row that
        # lacks them still yields a result row instead of raising KeyError
        # and taking the rest of the batch down with it.
        job_id = item.get("job_id")
        filename = item.get("filename", "unknown")
        logger.info(f"Processing: {filename}")

        # audio_bytes is passed directly, not as ObjectRef
        audio_bytes = item.get("audio_bytes")
        if audio_bytes:
            logger.info(f"Got {len(audio_bytes)} bytes directly")
            audio_size, error = len(audio_bytes), None
        else:
            # Covers both a missing key and an empty payload.
            logger.error("No audio_bytes!")
            audio_size, error = 0, "No audio_bytes"

        results.append(
            {
                "job_id": job_id,
                "filename": filename,
                "audio_size": audio_size,
                "error": error,
            }
        )

    return results


# Setup with ONE audio file - pass bytes DIRECTLY, not ObjectRef
AUDIO_DIR = Path(PROJECT_ROOT) / "audio_files"

# Take the first match lazily instead of materialising every path, and fail
# with a message that names the directory when there is nothing to test with
# (indexing an empty list would only raise a bare IndexError).
audio_file = next(AUDIO_DIR.glob("*.mp3"), None)
if audio_file is None:
    raise FileNotFoundError(f"No .mp3 files found in {AUDIO_DIR}")
print(f"Using: {audio_file.name}")

# The whole file is held in memory; these bytes are what gets queued below.
audio_bytes = audio_file.read_bytes()
print(f"Audio size: {len(audio_bytes)} bytes")

# Input queue for the pipeline, seeded with a single job that carries the
# audio payload inline.
job_queue = Queue(maxsize=10)
_seed_job = dict(
    job_id="test_001",
    filename=audio_file.name,
    audio_bytes=audio_bytes,  # Pass bytes directly, NOT ray.put()
)
job_queue.put(_seed_job)

# Datasource reading from the queue; the identity lambda hands each queued
# item on unchanged (presumably used as the row as-is — verify against
# QueueStreamingDatasource).
_source_config = StreamingDatasourceConfig(
    batch_size=1,
    batch_timeout=1.0,
    max_items=1,
)
datasource = QueueStreamingDatasource(
    queue=job_queue,
    item_to_row_fn=lambda x: x,
    config=_source_config,
)

# Single stage wrapping the test function above.
_probe_agent = FunctionAgent(process_fn=test_audio_direct)
_compute_config = AgentRayComputeConfig(num_actors=1, batch_size=1)
stage = AgentStage(
    agent=_probe_agent,
    config=_compute_config,
    name="DirectBytesTest",
)

pipeline = StreamingPipeline(
    datasource=datasource,
    stages=[stage],
    name="DebugTest",
)

import traceback

print("Starting pipeline...")
try:
    # Only the first batch is inspected; the loop exits as soon as it arrives.
    for batch in pipeline.stream(batch_size=1):
        print(f"SUCCESS! Got result: {batch}")
        break
    else:
        # Runs only when the stream finished without the break above firing,
        # i.e. no batch was ever produced. Without this the cell would end
        # silently and look like neither a success nor a failure.
        print("ERROR: pipeline produced no batches")
except Exception as e:
    print(f"ERROR: {e}")
    # str(e) alone drops the exception type and origin; this is a debugging
    # cell, so emit the full traceback as well.
    traceback.print_exc()
finally:
    # Always stop the pipeline, whether the run succeeded, failed, or was
    # cut short by the break.
    pipeline.stop()

2025-12-31 03:50:25,397	INFO worker.py:1998 -- Started a local Ray instance. View the dashboard at 127.0.0.1:8267
2025-12-31 03:50:25,404	INFO packaging.py:392 -- Ignoring upload to cluster for these files: [PosixPath('/Users/rajsingh/Desktop/code/music_audio_analyzer/.gitignore')]
2025-12-31 03:50:25,449	INFO packaging.py:691 -- Creating a file package for local module '/Users/rajsingh/Desktop/code/music_audio_analyzer'.
2025-12-31 03:50:25,450	INFO packaging.py:392 -- Ignoring upload to cluster for these files: [PosixPath('/Users/rajsingh/Desktop/code/music_audio_analyzer/.gitignore')]
2025-12-31 03:50:25,475	INFO packaging.py:463 -- Pushing file package 'gcs://_ray_pkg_1f023727ce2b90eb.zip' (0.42MiB) to Ray cluster...
2025-12-31 03:50:25,476	INFO packaging.py:476 -- Successfully pushed file package 'gcs://_ray_pkg_1f023727ce2b90eb.zip'.


Using: Red Hot Chili Peppers - Otherside.mp3
Audio size: 8021583 bytes


2025-12-31 03:50:29.321 | INFO     | src.streaming_pipeline.streaming_component:build:146 - Ray Data streaming config: preserve_order=False


Starting pipeline...


2025-12-31 03:50:31.159 | INFO     | src.streaming_pipeline.streaming_component:build:155 - Adding stage: DirectBytesTest
2025-12-31 03:50:31,167	INFO logging.py:397 -- Registered dataset logger for dataset dataset_1_0
2025-12-31 03:50:31,174	INFO streaming_executor.py:178 -- Starting execution of Dataset dataset_1_0. Full logs are in /tmp/ray/session_2025-12-31_03-50-22_342761_9140/logs/ray-data
2025-12-31 03:50:31,174	INFO streaming_executor.py:179 -- Execution plan of Dataset dataset_1_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadQueueStreaming] -> ActorPoolMapOperator[MapBatches(AgentCallable)]
2025-12-31 03:50:31,189	INFO streaming_executor.py:686 -- [dataset]: A new progress UI is available. To enable, set `ray.data.DataContext.get_current().enable_rich_progress_bars = True` and `ray.data.DataContext.get_current().use_ray_tqdm = False`.
2025-12-31 03:50:31,189	INFO progress_bar.py:155 -- Progress bar disabled because std

SUCCESS! Got result: {'job_id': array(['test_001'], dtype=object), 'filename': array(['Red Hot Chili Peppers - Otherside.mp3'], dtype=object), 'audio_size': array([8021583]), 'error': array([nan], dtype=float32)}
