# Test vLLM Streaming Pipeline (Single GPU)

Simple test to verify the streaming pipeline works with vLLM on a single GPU.

In [None]:
import ray
import time
import hashlib
import uuid
from pathlib import Path
from loguru import logger
from ray.util.queue import Queue

# Shut down any existing Ray instance before the notebook initializes Ray
# again in the next cell.
ray.shutdown()

In [None]:
# Initialize Ray; ignore_reinit_error=True lets this cell be re-run without
# failing on an already-initialized Ray.
ray.init(ignore_reinit_error=True)
cluster_resources = ray.cluster_resources()
print(f"Ray initialized: {cluster_resources}")

In [None]:
# Import pipeline components
from src.streaming_pipeline import (
    AgentRayComputeConfig,
    AgentStage,
    QueueStreamingDatasource,
    StreamingDatasourceConfig,
    StreamingPipeline,
)
from src.pipelines.instrument_detection.agents.audio_preprocessor import AudioPreprocessorAgent
from src.pipelines.instrument_detection.agents.instrument_detector import InstrumentDetectorAgent

In [None]:
# Configuration for this test run
# Model passed to InstrumentDetectorAgent as model_name.
MODEL_NAME = "Qwen/Qwen3-Omni-30B-A3B-Instruct"  # or your cached model path
# Maximum number of audio files picked up; also passed to the datasource as
# max_items.
NUM_TEST_FILES = 50  # Number of audio files to test with
# Seconds allowed for pipeline.warmup().
WARMUP_TIMEOUT = 300.0  # 5 minutes for model loading

In [None]:
# Find audio files
audio_dir = Path("../audio_files")
# Path.glob() yields matches in a filesystem-dependent order; sort them so
# every run selects the same files (and the same first file for warmup).
audio_files = sorted(audio_dir.glob("*.mp3"))[:NUM_TEST_FILES]
if not audio_files:
    # Fail here with a clear message: the warmup cell indexes audio_files[0],
    # so an empty list would otherwise only surface later as an IndexError.
    raise FileNotFoundError(
        f"No .mp3 files found in {audio_dir.resolve()} - check the working directory"
    )
print(f"Found {len(audio_files)} audio files to test:")
for f in audio_files:
    print(f"  - {f.name}")

In [None]:
# Helper functions
def create_job_row(filepath: Path) -> dict:
    """Build the pipeline job row for a single audio file.

    Args:
        filepath: Location of the audio file; its contents are read in full.

    Returns:
        A dict with randomly generated ``job_id`` and ``song_id`` values,
        ``song_hash`` (the first 16 hex characters of the SHA-256 digest of
        the file contents), the file's base name under ``filename`` and the
        raw contents under ``audio_bytes``.
    """
    raw_audio = filepath.read_bytes()

    # Two independent random tokens: one for the job, one for the song.
    job_token = uuid.uuid4().hex[:8]
    song_token = uuid.uuid4().hex[:8]
    content_digest = hashlib.sha256(raw_audio).hexdigest()

    return {
        "job_id": f"job_{job_token}",
        "song_id": f"song_{song_token}",
        "song_hash": content_digest[:16],
        "filename": filepath.name,
        "audio_bytes": raw_audio,
    }

# Identity function for item_to_row (items are already rows)
def identity(x):
    """Return ``x`` unchanged.

    Passed to the datasource as ``item_to_row_fn`` because the queued items
    are already row dicts and need no conversion.
    """
    return x

In [None]:
# Create the job queue and the streaming datasource that reads from it
job_queue = Queue(maxsize=100)

# Datasource settings are built first to keep the constructor call short
datasource_config = StreamingDatasourceConfig(
    parallelism=1,
    batch_size=1,
    batch_timeout=0.5,
    poll_interval=0.1,
    max_items=NUM_TEST_FILES,
)
datasource = QueueStreamingDatasource(
    queue=job_queue,
    item_to_row_fn=identity,
    config=datasource_config,
)
print("Datasource created")

In [None]:
# Create pipeline stages
# Each stage pairs an agent with its own Ray compute configuration.

# Stage 1: Audio Preprocessor (CPU)
preprocessor_agent = AudioPreprocessorAgent(target_sr=16000)
preprocessor_compute = AgentRayComputeConfig(
    num_actors=2,
    batch_size=1,
    num_cpus=2.0,
    max_concurrency=1,
)
preprocessor_stage = AgentStage(
    agent=preprocessor_agent,
    config=preprocessor_compute,
    name="AudioPreprocessor",
)

# Stage 2: Instrument Detector with vLLM (GPU)
detector_agent = InstrumentDetectorAgent(
    model_name=MODEL_NAME,
    use_vllm=True,
    tensor_parallel_size=1,  # Single GPU
    gpu_memory_utilization=0.90,
    max_model_len=16384,
    max_num_seqs=4,
)
detector_compute = AgentRayComputeConfig(
    num_actors=1,
    batch_size=1,
    num_gpus=1.0,
    max_concurrency=1,
)
detector_stage = AgentStage(
    agent=detector_agent,
    config=detector_compute,
    name="InstrumentDetector",
)

print("Stages created")

In [None]:
# Create the pipeline from the datasource and the stages, listed in the order
# preprocessor -> detector
pipeline_stages = [preprocessor_stage, detector_stage]
pipeline = StreamingPipeline(
    name="vLLM_InstrumentDetection",
    datasource=datasource,
    stages=pipeline_stages,
)
print("Pipeline created")

In [None]:
# Warmup the pipeline (this loads the vLLM model)
def get_warmup_data():
    """Return a one-row warmup batch built from the first audio file.

    The row has the same keys as the rows produced by ``create_job_row`` but
    uses fixed identifiers ("warmup_001" / "warmup") instead of generated ones.
    Reads the module-level ``audio_files`` list.
    """
    first_file = audio_files[0]
    warmup_row = {
        "job_id": "warmup_001",
        "song_id": "warmup",
        "song_hash": "warmup",
        "filename": first_file.name,
        "audio_bytes": first_file.read_bytes(),
    }
    return [warmup_row]

# Announce the warmup, then time the warmup call
print(f"Starting warmup (timeout: {WARMUP_TIMEOUT}s)...")
print("This will load the vLLM model - may take several minutes...")

warmup_start = time.time()
warmup_success = pipeline.warmup(
    timeout_seconds=WARMUP_TIMEOUT,
    warmup_data_fn=get_warmup_data,
)
warmup_time = time.time() - warmup_start

# Report the outcome; execution continues either way
if not warmup_success:
    print(f"Warmup failed or timed out after {warmup_time:.1f}s")
else:
    print(f"Warmup complete in {warmup_time:.1f}s - pipeline ready!")

In [None]:
# Submit one job per selected audio file to the queue
job_count = len(audio_files)
print(f"Submitting {job_count} jobs...")
for position, audio_file in enumerate(audio_files, start=1):
    row = create_job_row(audio_file)
    job_queue.put(row)
    print(f"  [{position}/{job_count}] Submitted: {row['filename']}")
    time.sleep(0.1)  # Small delay between submissions

print(f"All {job_count} jobs submitted")

In [None]:
# Stream results
results = []
# Wait for as many results as there are audio files selected for submission.
# Comparing against NUM_TEST_FILES instead means the early-exit condition can
# never be met when fewer files were found than requested.
expected_results = len(audio_files)
start_time = time.time()

print("Streaming results...")
print("-" * 60)

# With nothing selected there is nothing to wait for, so skip the stream.
batches = pipeline.stream(batch_size=1) if expected_results else iter(())

for batch in batches:
    if not batch:
        continue

    # The batch is columnar (column name -> sequence of values); rebuild one
    # dict per row. The first column determines the row count.
    keys = list(batch.keys())
    if not keys:
        continue

    n_items = len(batch[keys[0]])
    for i in range(n_items):
        result = {k: batch[k][i] for k in keys}
        results.append(result)

        elapsed = time.time() - start_time
        # A row without a filename should not abort the whole stream.
        filename = result.get("filename", "<unknown>")

        if result.get("error"):
            print(f"[{elapsed:.1f}s] {filename} -> ERROR: {result['error']}")
        else:
            instruments = result.get("instruments", [])
            print(f"[{elapsed:.1f}s] {filename} -> {instruments}")

    if len(results) >= expected_results:
        print("-" * 60)
        print(f"All {expected_results} results received!")
        break

total_time = time.time() - start_time
print(f"\nTotal streaming time: {total_time:.1f}s")
if results:
    print(f"Average per file: {total_time / len(results):.1f}s")
else:
    # Avoid a ZeroDivisionError when the stream produced nothing.
    print("Average per file: n/a (no results received)")

In [None]:
# Summary
banner = "=" * 60
print("\n" + banner)
print("RESULTS SUMMARY")
print(banner)

# Split the collected rows by whether they carry a truthy "error" value
successful, failed = [], []
for r in results:
    (failed if r.get("error") else successful).append(r)

print(f"Total: {len(results)}")
print(f"Successful: {len(successful)}")
print(f"Failed: {len(failed)}")

print("\nDetailed results:")
for r in results:
    if not r.get("error"):
        print(f"  - {r['filename']}: {r.get('instruments', [])}")
    else:
        print(f"  - {r['filename']}: ERROR - {r['error']}")

In [None]:
# Cleanup
try:
    pipeline.stop()
finally:
    # Always shut Ray down, even if stopping the pipeline raises; otherwise
    # ray.shutdown() would be skipped.
    ray.shutdown()
print("Cleanup complete")