# 19. Streaming Inference

Real-time inference on streaming sensor data with sliding windows.

## Contents
1. [Setup](#1-setup)
2. [Sliding Window Inference](#2-sliding-window-inference)
3. [Incremental Processing](#3-incremental-processing)
4. [Latency Optimization](#4-latency-optimization)
5. [Buffer Management](#5-buffer-management)
6. [Real-Time Simulation](#6-real-time-simulation)

---

## 1. Setup

In [None]:
import sys
from pathlib import Path

project_root = Path.cwd().parent
sys.path.insert(0, str(project_root / 'src'))

import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from typing import Dict, List, Optional, Deque
from collections import deque
import json
import time
import threading
import queue

# Inference stays on the CPU: real-time use avoids GPU synchronisation overhead.
device = torch.device('cpu')

# Seed both RNGs so the simulated sensor streams below are reproducible.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Environment report
print(f"Python: {sys.version}")
print(f"PyTorch: {torch.__version__}")
print(f"Device: {device}")

plt.style.use('seaborn-v0_8-whitegrid')

In [None]:
# Load model
from miracle.model.backbone import MMDTAELSTMBackbone
from miracle.model.multihead_lm import MultiHeadGCodeLM

VOCAB_PATH = project_root / 'data' / 'gcode_vocab_v2.json'
CHECKPOINT_PATH = project_root / 'outputs' / 'final_model' / 'checkpoint_best.pt'

# Read the vocabulary with an explicit encoding instead of relying on the
# platform default.
with open(VOCAB_PATH, encoding='utf-8') as f:
    vocab = json.load(f)

# Probe the checkpoint exactly once and remember the outcome in `checkpoint`.
# Re-checking `CHECKPOINT_PATH.exists()` later could disagree with this branch
# (file created/removed in between) and leave `checkpoint` undefined.
if CHECKPOINT_PATH.exists():
    # NOTE(security): weights_only=False unpickles arbitrary Python objects.
    # Only load checkpoints that come from a trusted source.
    checkpoint = torch.load(CHECKPOINT_PATH, map_location=device, weights_only=False)
    config = checkpoint.get('config', {})
else:
    checkpoint = None
    config = {'hidden_dim': 256, 'num_layers': 4, 'num_heads': 8, 'dropout': 0.0}
    # Make the fallback visible: everything below then runs on untrained weights.
    print(f"Checkpoint not found at {CHECKPOINT_PATH}; using randomly initialised weights")

backbone = MMDTAELSTMBackbone(
    continuous_dim=155,
    categorical_dims=[10, 10, 50, 50],
    d_model=config.get('hidden_dim', 256),
    num_layers=config.get('num_layers', 4),
    num_heads=config.get('num_heads', 8),
    dropout=0.0  # No dropout for inference
).to(device)

lm = MultiHeadGCodeLM(
    d_model=config.get('hidden_dim', 256),
    vocab_sizes=vocab.get('head_vocab_sizes', {'type': 10, 'command': 50, 'param_type': 30, 'param_value': 100})
).to(device)

if checkpoint is not None:
    backbone.load_state_dict(checkpoint['backbone_state_dict'])
    lm.load_state_dict(checkpoint['lm_state_dict'])

# Switch both modules to evaluation mode before any inference.
backbone.eval()
lm.eval()
print("Model loaded for streaming inference")

## 2. Sliding Window Inference

Process streaming data with overlapping windows.

In [None]:
class SlidingWindowInference:
    """Buffer a sample stream and run the model on the most recent window.

    Samples are appended one timestep at a time. Once at least
    ``window_size`` samples are buffered, ``predict`` stacks the latest
    ``window_size`` of them into a batch of one and feeds it through
    ``backbone`` and then ``lm``. ``stride`` is only stored here; the caller
    decides how often to call ``predict``.
    """

    def __init__(self, backbone, lm, window_size=64, stride=16):
        self.backbone, self.lm = backbone, lm
        self.window_size, self.stride = window_size, stride

        # Both streams keep at most two windows of history; older samples
        # fall off the left end automatically.
        capacity = window_size * 2
        self.continuous_buffer = deque(maxlen=capacity)
        self.categorical_buffer = deque(maxlen=capacity)

        # Reserved for prediction smoothing (not populated by this class).
        self.prediction_cache = {}

    def add_sample(self, continuous, categorical):
        """Append one timestep to both buffers."""
        streams = (
            (self.continuous_buffer, continuous),
            (self.categorical_buffer, categorical),
        )
        for stream, sample in streams:
            stream.append(sample)

    def can_predict(self):
        """Return True once a full window of samples is buffered."""
        return not len(self.continuous_buffer) < self.window_size

    def predict(self):
        """Run the model on the latest window; return None if it is not full yet."""
        if not self.can_predict():
            return None

        span = self.window_size
        recent_cont = tuple(self.continuous_buffer)[-span:]
        recent_cat = tuple(self.categorical_buffer)[-span:]

        # Stack along time, then add a leading batch axis of size 1.
        continuous = torch.stack(recent_cont)[None]
        categorical = torch.stack(recent_cat)[None]

        with torch.no_grad():
            return self.lm(self.backbone(continuous, categorical))

    def reset(self):
        """Drop all buffered samples."""
        for stream in (self.continuous_buffer, self.categorical_buffer):
            stream.clear()


# Create sliding window processor
sliding_window = SlidingWindowInference(backbone, lm, window_size=64, stride=16)

# Simulate streaming data
print("Simulating streaming inference...")

latencies = []  # wall time of each predict() call, in milliseconds
for t in range(100):
    # Generate random sensor sample
    continuous = torch.randn(155)
    categorical = torch.randint(0, 10, (4,))

    # Add to buffer
    sliding_window.add_sample(continuous, categorical)

    # Predict only once a full window is buffered, and then only on
    # timesteps that are a multiple of the stride.
    if sliding_window.can_predict() and t % sliding_window.stride == 0:
        # perf_counter() is monotonic and high-resolution; time.time() can
        # jump with clock adjustments and is too coarse on some platforms
        # for millisecond-level measurements.
        start = time.perf_counter()
        preds = sliding_window.predict()
        latency = (time.perf_counter() - start) * 1000
        latencies.append(latency)

print(f"Mean latency: {np.mean(latencies):.2f} ms")
print(f"Max latency: {np.max(latencies):.2f} ms")
print(f"P95 latency: {np.percentile(latencies, 95):.2f} ms")

In [None]:
# Visualize sliding window operation
fig, axes = plt.subplots(2, 1, figsize=(14, 6))

# Generate sample signal: four sine periods plus Gaussian noise
signal = np.sin(np.linspace(0, 8*np.pi, 200)) + 0.3 * np.random.randn(200)
window_size = 64
stride = 16

# Show signal with windows
axes[0].plot(signal, 'b-', alpha=0.7)

# Shade the first five windows. Their start positions are derived from
# `stride` so the picture stays consistent with the title if it is changed.
colors = plt.cm.Set2(np.linspace(0, 1, 5))
for i, start in enumerate(range(0, 5 * stride, stride)):
    if start + window_size <= len(signal):
        axes[0].axvspan(start, start + window_size, alpha=0.2, color=colors[i % len(colors)])
        axes[0].axvline(x=start, color=colors[i % len(colors)], linestyle='--', alpha=0.5)

axes[0].set_xlabel('Time Step')
axes[0].set_ylabel('Signal')
axes[0].set_title(f'Sliding Window (size={window_size}, stride={stride})')

# Latency distribution (uses `latencies` measured by the sliding-window demo)
axes[1].hist(latencies, bins=20, alpha=0.7, edgecolor='black')
axes[1].axvline(x=np.mean(latencies), color='red', linestyle='--', label=f'Mean: {np.mean(latencies):.2f} ms')
axes[1].axvline(x=np.percentile(latencies, 95), color='orange', linestyle='--', label=f'P95: {np.percentile(latencies, 95):.2f} ms')
axes[1].set_xlabel('Latency (ms)')
axes[1].set_ylabel('Count')
axes[1].set_title('Inference Latency Distribution')
axes[1].legend()

plt.tight_layout()
# savefig() does not create missing directories; make sure reports/ exists.
(project_root / 'reports').mkdir(parents=True, exist_ok=True)
plt.savefig(project_root / 'reports' / 'sliding_window_inference.png', dpi=150, bbox_inches='tight')
plt.show()

## 3. Incremental Processing

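Reuse computation from previous windows. For simplicity, the demo below still re-runs the model on the full window and returns only the predictions for the newly arrived samples; `CachedLSTMInference` sketches the hidden-state cache that true reuse would need.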

In [None]:
class IncrementalInference:
    """Windowed inference that reports predictions only for newly arrived samples.

    Every call currently re-runs ``backbone`` and ``lm`` over the latest
    ``window_size`` samples. ``cached_hidden`` and ``cache_position`` are
    bookkeeping slots for a future state-caching implementation; this class
    only keeps ``cache_position`` aligned when old samples are trimmed.
    """

    def __init__(self, backbone, lm, window_size=64):
        self.backbone = backbone
        self.lm = lm
        self.window_size = window_size

        # State caching (placeholders, see class docstring)
        self.cached_hidden = None
        self.cache_position = 0

        # Data buffers: one entry per timestep, oldest first
        self.continuous_buffer = []
        self.categorical_buffer = []

    def process_incremental(self, new_continuous, new_categorical):
        """Append new samples and return predictions for the new positions.

        Args:
            new_continuous: Sequence of per-timestep continuous samples.
            new_categorical: Sequence of per-timestep categorical samples,
                one for each entry of ``new_continuous``.

        Returns:
            ``None`` while fewer than ``window_size`` samples are buffered.
            Otherwise a dict with the same keys as the LM output, each value
            sliced along dimension 1 to the last ``len(new_continuous)``
            positions (capped at the window length, and empty when no new
            samples were passed).

        Raises:
            ValueError: If the two inputs differ in length, which would
                otherwise silently desynchronise the two buffers.
        """
        n_new = len(new_continuous)
        if len(new_categorical) != n_new:
            raise ValueError(
                f"new_continuous and new_categorical must have the same length "
                f"(got {n_new} and {len(new_categorical)})"
            )

        # Add new data
        self.continuous_buffer.extend(new_continuous)
        self.categorical_buffer.extend(new_categorical)

        # Keep only relevant history: at most two windows of samples
        excess = len(self.continuous_buffer) - self.window_size * 2
        if excess > 0:
            del self.continuous_buffer[:excess]
            del self.categorical_buffer[:excess]
            self.cache_position = max(0, self.cache_position - excess)

        if len(self.continuous_buffer) < self.window_size:
            return None

        # Full window inference for simplicity
        # In practice, use cached LSTM states
        continuous = torch.stack(self.continuous_buffer[-self.window_size:]).unsqueeze(0)
        categorical = torch.stack(self.categorical_buffer[-self.window_size:]).unsqueeze(0)

        with torch.no_grad():
            hidden = self.backbone(continuous, categorical)
            predictions = self.lm(hidden)

        # Slice with an explicit start index. A negative-index slice
        # (`v[:, -n_new:]`) degenerates to the whole window when n_new == 0.
        return {
            k: v[:, max(v.shape[1] - n_new, 0):]
            for k, v in predictions.items()
        }


class CachedLSTMInference:
    """Holder for an LSTM's (hidden, cell) state between inference calls.

    Until a state has been stored via ``update_state``, ``get_initial_state``
    produces zero tensors of shape ``(num_layers, batch_size, d_model)``.
    """

    def __init__(self, d_model, num_layers):
        self.d_model = d_model
        self.num_layers = num_layers
        self.hidden_state = self.cell_state = None

    def reset_state(self):
        """Forget any cached state."""
        self.hidden_state = None
        self.cell_state = None

    def update_state(self, new_hidden, new_cell):
        """Store detached copies of the latest hidden and cell states."""
        detached_hidden = new_hidden.detach()
        self.hidden_state = detached_hidden
        detached_cell = new_cell.detach()
        self.cell_state = detached_cell

    def get_initial_state(self, batch_size, device):
        """Return the cached ``(hidden, cell)`` pair, or zeros if nothing is cached.

        ``batch_size`` and ``device`` are only used when building the zero
        state; a cached state is returned as stored.
        """
        if self.hidden_state is not None:
            return (self.hidden_state, self.cell_state)

        state_shape = (self.num_layers, batch_size, self.d_model)
        zero_hidden = torch.zeros(state_shape, device=device)
        zero_cell = torch.zeros(state_shape, device=device)
        return (zero_hidden, zero_cell)


# Demo incremental inference: feed five batches of 16 random samples into an
# IncrementalInference with a 64-sample window.
incremental = IncrementalInference(backbone, lm, window_size=64)

print("Incremental inference demo...")
for batch in range(5):
    # Simulate receiving a batch of 16 new samples
    new_cont = [torch.randn(155) for _ in range(16)]
    new_cat = [torch.randint(0, 10, (4,)) for _ in range(16)]
    
    preds = incremental.process_incremental(new_cont, new_cat)
    
    # process_incremental returns None until 64 samples are buffered, so
    # nothing is printed for the first three batches (48 samples).
    if preds:
        print(f"Batch {batch}: Predictions for {preds['command'].shape[1]} positions")

## 4. Latency Optimization

Techniques to minimize inference latency.

In [None]:
def benchmark_batch_sizes(backbone, lm, window_size=64, batch_sizes=(1, 2, 4, 8, 16),
                          n_warmup=5, n_runs=50):
    """Measure forward-pass latency and throughput for several batch sizes.

    For each batch size, a random batch with 155 continuous features and 4
    categorical features (values in ``[0, 10)``) per timestep is pushed
    through ``lm(backbone(...))``.

    Args:
        backbone: Callable taking ``(continuous, categorical)``.
        lm: Callable applied to the backbone output.
        window_size: Number of timesteps per sequence.
        batch_sizes: Iterable of batch sizes to benchmark.
        n_warmup: Untimed forward passes run before measuring.
        n_runs: Timed forward passes per batch size (must be >= 1).

    Returns:
        A list with one dict per batch size, holding ``batch_size``,
        ``mean_latency_ms``, ``latency_per_sample_ms`` and ``throughput``
        (samples per second).

    Raises:
        ValueError: If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be >= 1, got {n_runs}")

    results = []

    for batch_size in batch_sizes:
        continuous = torch.randn(batch_size, window_size, 155)
        categorical = torch.randint(0, 10, (batch_size, window_size, 4))

        times = []
        with torch.no_grad():
            # Warmup: keep one-time setup costs out of the measurements
            for _ in range(n_warmup):
                lm(backbone(continuous, categorical))

            # Benchmark with the monotonic, high-resolution clock
            for _ in range(n_runs):
                start = time.perf_counter()
                lm(backbone(continuous, categorical))
                times.append((time.perf_counter() - start) * 1000)

        mean_time = float(np.mean(times))
        # Guard the division in case the clock could not resolve the call.
        throughput = batch_size * 1000 / mean_time if mean_time > 0 else float('inf')

        results.append({
            'batch_size': batch_size,
            'mean_latency_ms': mean_time,
            'latency_per_sample_ms': mean_time / batch_size,
            'throughput': throughput,  # samples/second
        })

    return results

# Run batch size benchmark with the default window size and batch sizes
batch_results = benchmark_batch_sizes(backbone, lm)

# Fixed-width table: latency columns are in milliseconds, throughput is in
# samples per second (as computed by benchmark_batch_sizes).
print(f"{'Batch':<8} {'Latency':<12} {'Per Sample':<12} {'Throughput':<12}")
print("-" * 44)
for r in batch_results:
    print(f"{r['batch_size']:<8} {r['mean_latency_ms']:<12.2f} {r['latency_per_sample_ms']:<12.2f} {r['throughput']:<12.1f}")

In [None]:
# JIT compilation for faster inference
class JITOptimizedModel(nn.Module):
    """Wrap backbone and LM heads in one module so they can be traced together."""

    def __init__(self, backbone, lm):
        super().__init__()
        # Assignment order fixes the submodule registration order.
        self.backbone = backbone
        self.lm = lm

    def forward(self, continuous, categorical):
        """Feed both input streams through the backbone, then the LM heads."""
        return self.lm(self.backbone(continuous, categorical))

# Create and trace model
combined_model = JITOptimizedModel(backbone, lm)

# Trace with sample input
sample_cont = torch.randn(1, 64, 155)
sample_cat = torch.randint(0, 10, (1, 64, 4))


def benchmark_inference(model, continuous, categorical, n_runs=100, n_warmup=5):
    """Return (mean, std) forward-pass latency of ``model`` in milliseconds.

    ``n_warmup`` untimed calls run first so that one-time costs (for a traced
    model, the optimisation done during its first invocations) do not skew
    the comparison.
    """
    times = []
    with torch.no_grad():
        for _ in range(n_warmup):
            model(continuous, categorical)
        for _ in range(n_runs):
            # perf_counter() is the monotonic, high-resolution clock.
            start = time.perf_counter()
            _ = model(continuous, categorical)
            times.append((time.perf_counter() - start) * 1000)
    return np.mean(times), np.std(times)


# Only the tracing call sits inside the `try`, so a failure while
# benchmarking is not misreported as "Tracing failed".
try:
    # The LM heads return a dict (consumers index it by key, e.g.
    # preds['command']). torch.jit.trace rejects dict outputs unless
    # strict=False is passed.
    traced_model = torch.jit.trace(combined_model, (sample_cont, sample_cat), strict=False)
except Exception as e:
    print(f"Tracing failed: {e}")
else:
    print("Model traced successfully")

    # Benchmark traced vs non-traced
    original_mean, original_std = benchmark_inference(combined_model, sample_cont, sample_cat)
    traced_mean, traced_std = benchmark_inference(traced_model, sample_cont, sample_cat)

    print(f"\nOriginal: {original_mean:.2f} ± {original_std:.2f} ms")
    print(f"Traced:   {traced_mean:.2f} ± {traced_std:.2f} ms")
    print(f"Speedup:  {original_mean / traced_mean:.2f}x")

In [None]:
# Visualize latency optimization
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Batch size vs throughput (from the batch-size benchmark results)
batch_sizes = [r['batch_size'] for r in batch_results]
throughputs = [r['throughput'] for r in batch_results]
latencies = [r['mean_latency_ms'] for r in batch_results]

# Two y-axes sharing one x-axis: throughput on the left, latency on the right
ax1 = axes[0]
ax2 = ax1.twinx()

line1 = ax1.plot(batch_sizes, throughputs, 'b-o', label='Throughput', linewidth=2)
line2 = ax2.plot(batch_sizes, latencies, 'r--s', label='Latency', linewidth=2)

ax1.set_xlabel('Batch Size')
ax1.set_ylabel('Throughput (samples/sec)', color='blue')
ax2.set_ylabel('Latency (ms)', color='red')
ax1.set_title('Batch Size Trade-off')

# Merge the handles of both axes into a single legend
lines = line1 + line2
labels = [line.get_label() for line in lines]
ax1.legend(lines, labels)

# Latency breakdown (simulated): these values are hard-coded, not measured,
# and the title says so to keep the figure from being read as a profile.
components = ['Input\nProcessing', 'Backbone', 'LM Head', 'Output\nProcessing']
times = [1.2, 8.5, 2.3, 0.5]

axes[1].barh(components, times, color=['steelblue', 'coral', 'forestgreen', 'purple'])
axes[1].set_xlabel('Time (ms)')
axes[1].set_title('Latency Breakdown (simulated)')

for i, (comp, t) in enumerate(zip(components, times)):
    axes[1].text(t + 0.1, i, f'{t:.1f}ms', va='center')

plt.tight_layout()
# savefig() does not create missing directories; make sure reports/ exists.
(project_root / 'reports').mkdir(parents=True, exist_ok=True)
plt.savefig(project_root / 'reports' / 'latency_optimization.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Buffer Management

Efficient memory management for streaming.

In [None]:
class CircularBuffer:
    """Fixed-capacity ring buffer of feature vectors backed by one tensor.

    Storage is a ``(max_size, feature_dim)`` tensor created with
    ``torch.zeros`` (default dtype). ``write_idx`` is the slot the next
    sample goes into and ``count`` is the number of valid samples (capped at
    ``max_size``). Once full, new samples overwrite the oldest ones.
    """

    def __init__(self, max_size, feature_dim):
        if max_size <= 0:
            # A zero-sized buffer would make the index arithmetic below
            # divide by zero.
            raise ValueError(f"max_size must be positive, got {max_size}")
        self.max_size = max_size
        self.feature_dim = feature_dim
        self.buffer = torch.zeros(max_size, feature_dim)
        self.write_idx = 0
        self.count = 0

    def add(self, sample):
        """Add a single sample, overwriting the oldest one when full."""
        self.buffer[self.write_idx] = sample
        self.write_idx = (self.write_idx + 1) % self.max_size
        self.count = min(self.count + 1, self.max_size)

    def add_batch(self, samples):
        """Add multiple samples (oldest first) efficiently.

        ``samples`` may be a tensor whose first dimension indexes samples, or
        any sequence of per-sample tensors/array-likes. Empty input is a
        no-op.
        """
        if not torch.is_tensor(samples):
            samples = list(samples)
            if not samples:
                return
            # Slice assignment needs a tensor on the right-hand side.
            samples = torch.stack([torch.as_tensor(s) for s in samples])

        n = len(samples)
        if n == 0:
            return

        if n >= self.max_size:
            # Just keep the last max_size samples
            self.buffer[:] = samples[-self.max_size:]
            self.write_idx = 0
            self.count = self.max_size
            return

        # Circular write, split in two if it runs past the end of storage
        end_idx = self.write_idx + n
        if end_idx <= self.max_size:
            self.buffer[self.write_idx:end_idx] = samples
        else:
            first_part = self.max_size - self.write_idx
            self.buffer[self.write_idx:] = samples[:first_part]
            self.buffer[:n - first_part] = samples[first_part:]

        self.write_idx = end_idx % self.max_size
        self.count = min(self.count + n, self.max_size)

    def get_window(self, size):
        """Return a copy of the last ``size`` samples in chronological order.

        Returns ``None`` when fewer than ``size`` samples are stored, and an
        empty ``(0, feature_dim)`` tensor for ``size == 0``.

        Raises:
            ValueError: If ``size`` is negative.
        """
        if size < 0:
            raise ValueError(f"size must be non-negative, got {size}")
        if self.count < size:
            return None
        if size == 0:
            # Without this guard, start == write_idx below and the wrap-around
            # branch would return the entire buffer.
            return self.buffer.new_empty((0, self.feature_dim))

        start = (self.write_idx - size) % self.max_size
        if start < self.write_idx:
            # Contiguous region: clone so callers cannot alias the storage
            return self.buffer[start:self.write_idx].clone()
        # Wrapped region: torch.cat already allocates a new tensor
        return torch.cat([self.buffer[start:], self.buffer[:self.write_idx]])

    def is_full(self):
        """Return True once ``max_size`` samples have been stored."""
        return self.count >= self.max_size


class StreamingBuffer:
    """Pair of equally sized ring buffers, one per input stream.

    Continuous and categorical samples are stored in separate
    ``CircularBuffer`` instances that are always written together, so a
    window taken from both covers the same timesteps.
    """

    def __init__(self, max_size, continuous_dim, categorical_dim):
        self.max_size = max_size
        self.continuous = CircularBuffer(max_size, continuous_dim)
        self.categorical = CircularBuffer(max_size, categorical_dim)

    def add(self, continuous, categorical):
        """Store one timestep in both ring buffers."""
        pairs = ((self.continuous, continuous), (self.categorical, categorical))
        for ring, sample in pairs:
            ring.add(sample)

    def get_window(self, size):
        """Return the last ``size`` samples as ``(continuous, categorical)``.

        The categorical window is converted with ``.long()``. If either
        buffer cannot supply the window, ``(None, None)`` is returned.
        """
        windows = (
            self.continuous.get_window(size),
            self.categorical.get_window(size),
        )
        if any(window is None for window in windows):
            return None, None
        cont_window, cat_window = windows
        return cont_window, cat_window.long()


# Test circular buffer: 150 samples into a 128-slot buffer, so the oldest
# samples are overwritten before the window is read.
buffer = StreamingBuffer(max_size=128, continuous_dim=155, categorical_dim=4)

# Simulate adding data
for t in range(150):
    buffer.add(torch.randn(155), torch.randint(0, 10, (4,)).float())

# Most recent 64 timesteps of each stream
cont, cat = buffer.get_window(64)
print(f"Window shape: continuous={cont.shape}, categorical={cat.shape}")

## 6. Real-Time Simulation

Complete streaming inference simulation.

In [None]:
class StreamingInferenceEngine:
    """Streaming inference engine: buffer samples, predict every ``stride`` samples.

    Each incoming sample is stored in a ``StreamingBuffer``. Once at least
    ``stride`` samples have arrived since the last prediction and a full
    window is available, the model runs on the latest ``window_size``
    samples and the arg-max of the ``'command'`` head at the last position
    is reported.

    Args:
        backbone: Callable taking ``(continuous, categorical)`` batches.
        lm: Callable mapping the backbone output to a dict containing a
            ``'command'`` entry.
        window_size: Number of timesteps fed to the model per prediction.
        stride: Minimum number of new samples between two predictions.
        buffer_size: Capacity of the underlying ring buffers.
        continuous_dim: Width of one continuous sample.
        categorical_dim: Width of one categorical sample.

    Raises:
        ValueError: If ``buffer_size`` is smaller than ``window_size``. The
            buffer could then never supply a full window and the engine
            would silently never predict.
    """

    def __init__(self, backbone, lm, window_size=64, stride=16,
                 buffer_size=256, continuous_dim=155, categorical_dim=4):
        if buffer_size < window_size:
            raise ValueError(
                f"buffer_size ({buffer_size}) must be >= window_size ({window_size})"
            )

        self.backbone = backbone
        self.lm = lm
        self.window_size = window_size
        self.stride = stride

        self.buffer = StreamingBuffer(buffer_size, continuous_dim, categorical_dim)
        self.samples_since_prediction = 0

        # Metrics: one entry per prediction made
        self.latencies = []    # model forward time in milliseconds
        self.predictions = []  # predicted 'command' class indices

    def process_sample(self, continuous, categorical):
        """Process a single incoming sample.

        Returns:
            The predicted ``'command'`` class index (int) when inference ran
            for this sample, otherwise ``None``.
        """
        self.buffer.add(continuous, categorical.float())
        self.samples_since_prediction += 1

        # Not due yet: fewer than `stride` samples since the last prediction
        if self.samples_since_prediction < self.stride:
            return None

        cont, cat = self.buffer.get_window(self.window_size)
        if cont is None:
            # Window not filled yet. The counter keeps growing, so the first
            # prediction happens as soon as a full window exists.
            return None

        # Time only the model forward pass. perf_counter() is monotonic and
        # high-resolution, unlike time.time().
        start = time.perf_counter()
        with torch.no_grad():
            hidden = self.backbone(cont.unsqueeze(0), cat.unsqueeze(0))
            preds = self.lm(hidden)
        self.latencies.append((time.perf_counter() - start) * 1000)

        # Get prediction for latest position
        latest_pred = preds['command'][0, -1].argmax().item()
        self.predictions.append(latest_pred)

        self.samples_since_prediction = 0
        return latest_pred

    def get_stats(self):
        """Return latency statistics, or an empty dict before the first prediction.

        Values are plain Python floats/ints so the dict can be printed or
        serialised directly.
        """
        if not self.latencies:
            return {}

        return {
            'mean_latency_ms': float(np.mean(self.latencies)),
            'std_latency_ms': float(np.std(self.latencies)),
            'p95_latency_ms': float(np.percentile(self.latencies, 95)),
            'max_latency_ms': float(np.max(self.latencies)),
            'total_predictions': len(self.predictions),
        }


# Run simulation: push 500 random samples through the engine one at a time.
# The loop does not sleep between samples, so the "100Hz" in the message
# below is nominal rather than an enforced rate.
engine = StreamingInferenceEngine(backbone, lm, window_size=64, stride=8)

print("Running real-time simulation (500 samples at 100Hz)...")
simulation_results = []

for t in range(500):
    # Simulate sensor reading
    continuous = torch.randn(155)
    categorical = torch.randint(0, 10, (4,))
    
    # Process
    pred = engine.process_sample(continuous, categorical)
    
    # process_sample returns None on timesteps where no inference ran;
    # when it did run, its latency is the last entry of engine.latencies.
    if pred is not None:
        simulation_results.append({
            'time': t,
            'prediction': pred,
            'latency': engine.latencies[-1]
        })

# Print stats (floats with two decimals, everything else as-is)
stats = engine.get_stats()
print("\nPerformance Statistics:")
for key, value in stats.items():
    print(f"  {key}: {value:.2f}" if isinstance(value, float) else f"  {key}: {value}")

In [None]:
# Visualize real-time simulation
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Latency over time (one point per prediction made during the simulation)
times = [r['time'] for r in simulation_results]
latencies = [r['latency'] for r in simulation_results]

axes[0, 0].plot(times, latencies, 'b-', alpha=0.7)
axes[0, 0].axhline(y=np.mean(latencies), color='red', linestyle='--', label=f'Mean: {np.mean(latencies):.2f}ms')
axes[0, 0].axhline(y=10, color='orange', linestyle=':', label='10ms target')
axes[0, 0].set_xlabel('Sample')
axes[0, 0].set_ylabel('Latency (ms)')
axes[0, 0].set_title('Inference Latency Over Time')
axes[0, 0].legend()

# Predictions over time
predictions = [r['prediction'] for r in simulation_results]
axes[0, 1].plot(times, predictions, 'g-', alpha=0.7)
axes[0, 1].set_xlabel('Sample')
axes[0, 1].set_ylabel('Predicted Class')
axes[0, 1].set_title('Predictions Over Time')

# Latency histogram
axes[1, 0].hist(latencies, bins=30, alpha=0.7, edgecolor='black')
axes[1, 0].axvline(x=np.percentile(latencies, 95), color='red', linestyle='--', 
                   label=f'P95: {np.percentile(latencies, 95):.2f}ms')
axes[1, 0].set_xlabel('Latency (ms)')
axes[1, 0].set_ylabel('Count')
axes[1, 0].set_title('Latency Distribution')
axes[1, 0].legend()

# System metrics
metrics = list(stats.keys())
values = [stats[m] for m in metrics]

# Normalize for visualization: bars are scaled by the largest metric, while
# the text labels next to them show the raw values.
norm_values = np.array(values) / max(values)

axes[1, 1].barh(metrics, norm_values, color='steelblue')
for i, (m, v) in enumerate(zip(metrics, values)):
    axes[1, 1].text(norm_values[i] + 0.02, i, f'{v:.2f}' if isinstance(v, float) else str(v), va='center')
axes[1, 1].set_xlim(0, 1.3)
axes[1, 1].set_title('Performance Metrics (normalized)')

plt.tight_layout()
# savefig() does not create missing directories; make sure reports/ exists.
(project_root / 'reports').mkdir(parents=True, exist_ok=True)
plt.savefig(project_root / 'reports' / 'streaming_simulation.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Save streaming configuration
# Read the settings back from the engine that produced `stats`, so the saved
# file cannot drift from what was actually benchmarked.
streaming_config = {
    'window_size': engine.window_size,
    'stride': engine.stride,
    'buffer_size': engine.buffer.max_size,
    'continuous_dim': engine.buffer.continuous.feature_dim,
    'categorical_dim': engine.buffer.categorical.feature_dim,
    'performance': stats
}

config_path = project_root / 'configs' / 'streaming_config.json'
config_path.parent.mkdir(parents=True, exist_ok=True)
with open(config_path, 'w', encoding='utf-8') as f:
    json.dump(streaming_config, f, indent=2)

print(f"Streaming configuration saved to: {config_path}")

---

## Summary

This notebook covers streaming inference:

1. **Sliding Window**: Process continuous stream with overlapping windows
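2. **Incremental Processing**: Return predictions only for newly arrived samples (full-window recompute in the demo; LSTM state caching sketched for true reuse between windows)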
3. **Latency Optimization**: JIT tracing, batch size tuning
4. **Buffer Management**: Efficient circular buffers
5. **Real-Time Simulation**: Complete streaming engine with metrics

Key performance targets:
- Latency: < 10ms per prediction
- Throughput: > 100 samples/second
- Memory: O(window_size) buffer

---

**Navigation:**
← [Previous: 18_transfer_learning](18_transfer_learning.ipynb) |
[Back to README](README.md)