In [3]:
import ray
import time
import threading
import sys
import hashlib
from pathlib import Path
from ray.util.queue import Queue

ray.shutdown()
time.sleep(1)

PROJECT_ROOT = "/Users/rajsingh/Desktop/code/music_audio_analyzer"
sys.path.append(PROJECT_ROOT)
ray.init(
  runtime_env={
      "working_dir": PROJECT_ROOT,
      "excludes": ["*.mp3", "*.wav", "audio_files/", ".git/", "__pycache__/"],
      "env_vars": {"_CACHE_BUST": str(time.time())}
  }
)

from src.streaming_pipeline import (
  FunctionAgent,
  AgentRayComputeConfig,
  AgentStage,
  QueueStreamingDatasource,
  StreamingDatasourceConfig,
  StreamingPipeline,
)

# Simulated "preprocessing" - just returns size info
def mock_preprocess(items):
  import time
  from loguru import logger

  results = []
  for item in items:
      # Simulate some processing time (100-300ms per item)
      time.sleep(0.2)

      audio_bytes = item.get("audio_bytes", b"")
      results.append({
          "job_id": item["job_id"],
          "filename": item["filename"],
          "audio_size": len(audio_bytes),
          "processed": True,
      })
      logger.info(f"Processed {item['filename'][:30]}... ({len(audio_bytes)} bytes)")

  return results

# Get audio files
AUDIO_DIR = Path(PROJECT_ROOT) / "audio_files"
AUDIO_FILES = list(AUDIO_DIR.glob("*.mp3"))[:8]  # Use 8 files
TOTAL_JOBS = len(AUDIO_FILES)
print(f"Using {TOTAL_JOBS} audio files")

# Create queue
job_queue = Queue(maxsize=100)

datasource = QueueStreamingDatasource(
  queue=job_queue,
  item_to_row_fn=lambda x: x,
  config=StreamingDatasourceConfig(
      batch_size=2,
      batch_timeout=0.5,
      max_items=TOTAL_JOBS,
  ),
)

stage = AgentStage(
  agent=FunctionAgent(process_fn=mock_preprocess),
  config=AgentRayComputeConfig(num_actors=2, batch_size=2),  # 2 parallel workers
  name="MockPreprocessor",
)

pipeline = StreamingPipeline(datasource=datasource, stages=[stage], name="StreamingTest")

# Tracking
events = []
start_time = time.time()

def log_event(event_type, details):
  elapsed = time.time() - start_time
  events.append((elapsed, event_type, details))
  print(f"[{elapsed:6.2f}s] {event_type}: {details}")

# Slow producer - submits jobs 1 second apart
def slow_producer():
  for i, audio_file in enumerate(AUDIO_FILES):
      audio_bytes = audio_file.read_bytes()
      job = {
          "job_id": f"job_{i:03d}",
          "filename": audio_file.name,
          "audio_bytes": audio_bytes,
      }
      job_queue.put(job)
      log_event("SUBMIT", f"job_{i:03d} ({audio_file.name[:25]}...) - {len(audio_bytes)//1024}KB")
      time.sleep(1.0)  # 1 second between submissions
  log_event("PRODUCER_DONE", f"All {TOTAL_JOBS} jobs submitted")

# Start producer
t = threading.Thread(target=slow_producer, daemon=True)
t.start()

# Consume results
results = []
try:
  for batch in pipeline.stream(batch_size=1):
      if batch:
          keys = list(batch.keys())
          n = len(batch[keys[0]]) if keys else 0
          for i in range(n):
              result = {k: batch[k][i] for k in keys}
              results.append(result)
              log_event("RESULT", f"{result['job_id']} - {result['filename'][:25]}... ({result['audio_size']//1024}KB)")

      if len(results) >= TOTAL_JOBS:
          break

except KeyboardInterrupt:
  print("Interrupted")
except Exception as e:
  print(f"ERROR: {e}")
  import traceback
  traceback.print_exc()
finally:
  pipeline.stop()
  t.join(timeout=2)

# Analysis
print("\n" + "=" * 60)
print("STREAMING ANALYSIS")
print("=" * 60)

submit_events = [(t, d) for t, e, d in events if e == "SUBMIT"]
result_events = [(t, d) for t, e, d in events if e == "RESULT"]
producer_done = [t for t, e, d in events if e == "PRODUCER_DONE"]

if result_events and producer_done:
  first_result = result_events[0][0]
  done_time = producer_done[0]
  results_before_done = sum(1 for t, _ in result_events if t < done_time)

  print(f"\nTiming:")
  print(f"  First result at:     {first_result:.2f}s")
  print(f"  Producer done at:    {done_time:.2f}s")
  print(f"  Last result at:      {result_events[-1][0]:.2f}s")

  print(f"\nStreaming metrics:")
  print(f"  Results before producer done: {results_before_done}/{len(result_events)}")

  if first_result < done_time:
      print(f"\n✅ STREAMING CONFIRMED!")
      print(f"   First result arrived {done_time - first_result:.2f}s BEFORE all jobs submitted")
  else:
      print(f"\n❌ NOT STREAMING - results waited for all submissions")

print("\n" + "-" * 60)
print("TIMELINE:")
print("-" * 60)
for elapsed, event_type, details in sorted(events):
  marker = ">>>" if event_type == "RESULT" else "   "
  print(f"{marker} [{elapsed:6.2f}s] {event_type:15s} {details[:50]}")

2025-12-31 03:53:42,949	INFO worker.py:1998 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8268 [39m[22m
2025-12-31 03:53:42,952	INFO packaging.py:392 -- Ignoring upload to cluster for these files: [PosixPath('/Users/rajsingh/Desktop/code/music_audio_analyzer/.gitignore')]
2025-12-31 03:53:42,985	INFO packaging.py:691 -- Creating a file package for local module '/Users/rajsingh/Desktop/code/music_audio_analyzer'.
2025-12-31 03:53:42,986	INFO packaging.py:392 -- Ignoring upload to cluster for these files: [PosixPath('/Users/rajsingh/Desktop/code/music_audio_analyzer/.gitignore')]
2025-12-31 03:53:43,012	INFO packaging.py:463 -- Pushing file package 'gcs://_ray_pkg_48ba2e4be7307197.zip' (0.38MiB) to Ray cluster...
2025-12-31 03:53:43,013	INFO packaging.py:476 -- Successfully pushed file package 'gcs://_ray_pkg_48ba2e4be7307197.zip'.
[32m2025-12-31 03:53:45.399[0m | [1mINFO    [0m | [36msrc.streaming_pipeline.streaming_component[0m:[36mbuild[0m:[36m14

Using 8 audio files
[  1.75s] SUBMIT: job_000 (Red Hot Chili Peppers - O...) - 7833KB


[32m2025-12-31 03:53:47.412[0m | [1mINFO    [0m | [36msrc.streaming_pipeline.streaming_component[0m:[36mbuild[0m:[36m155[0m - [1mAdding stage: MockPreprocessor[0m
2025-12-31 03:53:47,421	INFO logging.py:397 -- Registered dataset logger for dataset dataset_1_0
2025-12-31 03:53:47,428	INFO streaming_executor.py:178 -- Starting execution of Dataset dataset_1_0. Full logs are in /tmp/ray/session_2025-12-31_03-53-39_968160_9277/logs/ray-data
2025-12-31 03:53:47,428	INFO streaming_executor.py:179 -- Execution plan of Dataset dataset_1_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadQueueStreaming] -> ActorPoolMapOperator[MapBatches(AgentCallable)]
2025-12-31 03:53:47,450	INFO streaming_executor.py:686 -- [dataset]: A new progress UI is available. To enable, set `ray.data.DataContext.get_current().enable_rich_progress_bars = True` and `ray.data.DataContext.get_current().use_ray_tqdm = False`.
2025-12-31 03:53:47,451	INFO progress_bar.py:155 -- Progress bar disabled because st

[  2.77s] SUBMIT: job_001 (Drake - Drew A Picasso.mp...) - 7747KB
[  3.79s] SUBMIT: job_002 (Chris Brown, Drake - No G...) - 8256KB


[36m(MapWorker(MapBatches(AgentCallable)) pid=9351)[0m 2025-12-31 03:53:49.575 | INFO     | src.streaming_pipeline.agent:__init__:301 - AgentCallable initialized: FunctionAgent
[36m(ReadQueueStreaming->SplitBlocks(200) pid=9353)[0m 2025-12-31 03:53:49.629 | INFO     | src.streaming_pipeline.streaming_datasource:make_block_generator:247 - Generator started. queue=<ray.util.queue.Queue object at 0x1162fc5e0>, max_items=8
[36m(ReadQueueStreaming->SplitBlocks(200) pid=9353)[0m 2025-12-31 03:53:49.638 | DEBUG    | src.streaming_pipeline.streaming_datasource:make_block_generator:281 - Got item #0
[36m(ReadQueueStreaming->SplitBlocks(200) pid=9353)[0m 2025-12-31 03:53:49.649 | DEBUG    | src.streaming_pipeline.streaming_datasource:make_block_generator:281 - Got item #1
[36m(ReadQueueStreaming->SplitBlocks(200) pid=9353)[0m 2025-12-31 03:53:49.655 | DEBUG    | src.streaming_pipeline.streaming_datasource:make_block_generator:334 - Yielding batch with 2 rows, total_read=2
[36m(ReadQue

[  4.80s] RESULT: job_000 - Red Hot Chili Peppers - O... (7833KB)
[  4.80s] RESULT: job_001 - Drake - Drew A Picasso.mp... (7747KB)
[  4.81s] SUBMIT: job_003 (Tom Misch - Isn't She Lov...) - 2638KB


[36m(ReadQueueStreaming->SplitBlocks(200) pid=9353)[0m 2025-12-31 03:53:50.679 | DEBUG    | src.streaming_pipeline.streaming_datasource:make_block_generator:334 - Yielding batch with 1 rows, total_read=4
[36m(MapWorker(MapBatches(AgentCallable)) pid=9351)[0m 2025-12-31 03:53:50.898 | INFO     | __main__:mock_preprocess:48 - Processed Chris Brown, Drake - No Guidan... (8454211 bytes)


[  5.73s] RESULT: job_002 - Chris Brown, Drake - No G... (8256KB)
[  5.73s] RESULT: job_003 - Tom Misch - Isn't She Lov... (2638KB)
[  5.83s] SUBMIT: job_004 (Matt Quentin - Morning De...) - 7129KB


[36m(MapWorker(MapBatches(AgentCallable)) pid=9351)[0m 2025-12-31 03:53:51.100 | INFO     | __main__:mock_preprocess:48 - Processed Tom Misch - Isn't She Lovely.m... (2701416 bytes)
[36m(ReadQueueStreaming->SplitBlocks(200) pid=9353)[0m 2025-12-31 03:53:51.240 | DEBUG    | src.streaming_pipeline.streaming_datasource:make_block_generator:281 - Got item #4
[36m(ReadQueueStreaming->SplitBlocks(200) pid=9353)[0m 2025-12-31 03:53:51.751 | DEBUG    | src.streaming_pipeline.streaming_datasource:make_block_generator:334 - Yielding batch with 1 rows, total_read=5


[  6.84s] SUBMIT: job_005 (Doja Cat - Streets.mp3...) - 8340KB


[36m(ReadQueueStreaming->SplitBlocks(200) pid=9353)[0m 2025-12-31 03:53:52.258 | DEBUG    | src.streaming_pipeline.streaming_datasource:make_block_generator:334 - Yielding batch with 1 rows, total_read=6
[36m(MapWorker(MapBatches(AgentCallable)) pid=9351)[0m 2025-12-31 03:53:52.464 | INFO     | __main__:mock_preprocess:48 - Processed Matt Quentin - Morning Dew.mp3... (7300333 bytes)
2025-12-31 03:53:52,577	INFO progress_bar.py:215 -- ReadQueueStreaming->SplitBlocks(200): Tasks: 1; Actors: 0; Queued blocks: 0 (0.0B); Resources: 1.0 CPU, 28.8MiB object store: Progress Completed 6 / ?
2025-12-31 03:53:52,578	INFO progress_bar.py:215 -- MapBatches(AgentCallable): Tasks: 1; Actors: 2; Queued blocks: 0 (0.0B); Resources: 2.0 CPU, 120.0B object store; [all objects local]: Progress Completed 4 / ?
2025-12-31 03:53:52,579	INFO progress_bar.py:215 -- Running Dataset: dataset_1_0. Active & requested resources: 3/10 CPU, 28.8MiB/1.0GiB object store: Progress Completed 4 / ?
[36m(MapWorker(Map

[  7.31s] RESULT: job_004 - Matt Quentin - Morning De... (7129KB)
[  7.31s] RESULT: job_005 - Doja Cat - Streets.mp3... (8340KB)
[  7.85s] SUBMIT: job_006 (mt. fujitive - sundown.mp...) - 2996KB


[36m(ReadQueueStreaming->SplitBlocks(200) pid=9353)[0m 2025-12-31 03:53:53.319 | DEBUG    | src.streaming_pipeline.streaming_datasource:make_block_generator:334 - Yielding batch with 1 rows, total_read=7


[  8.88s] SUBMIT: job_007 (NAV, Don Toliver - YOU (F...) - 5738KB


[36m(ReadQueueStreaming->SplitBlocks(200) pid=9353)[0m 2025-12-31 03:53:54.376 | DEBUG    | src.streaming_pipeline.streaming_datasource:make_block_generator:334 - Yielding batch with 1 rows, total_read=8
[36m(ReadQueueStreaming->SplitBlocks(200) pid=9353)[0m 2025-12-31 03:53:54.378 | INFO     | src.streaming_pipeline.streaming_datasource:make_block_generator:260 - Reached max_items limit (8)
[36m(ReadQueueStreaming->SplitBlocks(200) pid=9353)[0m 2025-12-31 03:53:54.379 | INFO     | src.streaming_pipeline.streaming_datasource:make_block_generator:345 - StreamingDatasource reader exiting, read 8 items
[36m(MapWorker(MapBatches(AgentCallable)) pid=9351)[0m 2025-12-31 03:53:54.589 | INFO     | __main__:mock_preprocess:48 - Processed mt. fujitive - sundown.mp3... (3068344 bytes)
[36m(MapWorker(MapBatches(AgentCallable)) pid=9352)[0m 2025-12-31 03:53:49.575 | INFO     | src.streaming_pipeline.agent:__init__:301 - AgentCallable initialized: FunctionAgent
[32m2025-12-31 03:53:54.796

[  9.41s] RESULT: job_006 - mt. fujitive - sundown.mp... (2996KB)
[  9.41s] RESULT: job_007 - NAV, Don Toliver - YOU (F... (5738KB)


[36m(MapWorker(MapBatches(AgentCallable)) pid=9351)[0m 2025-12-31 03:53:54.791 | INFO     | __main__:mock_preprocess:48 - Processed NAV, Don Toliver - YOU (FT DON... (5875969 bytes)


[  9.88s] PRODUCER_DONE: All 8 jobs submitted

STREAMING ANALYSIS

Timing:
  First result at:     4.80s
  Producer done at:    9.88s
  Last result at:      9.41s

Streaming metrics:
  Results before producer done: 8/8

✅ STREAMING CONFIRMED!
   First result arrived 5.08s BEFORE all jobs submitted

------------------------------------------------------------
TIMELINE:
------------------------------------------------------------
    [  1.75s] SUBMIT          job_000 (Red Hot Chili Peppers - O...) - 7833KB
    [  2.77s] SUBMIT          job_001 (Drake - Drew A Picasso.mp...) - 7747KB
    [  3.79s] SUBMIT          job_002 (Chris Brown, Drake - No G...) - 8256KB
>>> [  4.80s] RESULT          job_000 - Red Hot Chili Peppers - O... (7833KB)
>>> [  4.80s] RESULT          job_001 - Drake - Drew A Picasso.mp... (7747KB)
    [  4.81s] SUBMIT          job_003 (Tom Misch - Isn't She Lov...) - 2638KB
>>> [  5.73s] RESULT          job_002 - Chris Brown, Drake - No G... (8256KB)
>>> [  5.73s] RESULT   