# QTAU with Ray: Distributed Computing Examples

This notebook demonstrates how to use QTAU with the **Ray** execution engine for distributed computing tasks.

## Overview

Ray is a high-performance distributed computing framework that excels at:
- Actor-based computations
- GPU-accelerated workloads
- Large-scale machine learning
- Low-latency task execution

### Table of Contents
1. [Setup and Configuration](#setup)
2. [Basic Ray Task Distribution](#basic)
3. [Resource-Aware Task Scheduling](#resources)
4. [GPU Task Execution](#gpu)
5. [Throughput Scaling Analysis](#throughput)
6. [Actor Patterns](#actors)

## 1. Setup and Configuration <a id="setup"></a>

In [None]:
# Import required libraries
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set plotting style
# Both calls change global matplotlib/seaborn defaults, so they affect every
# figure created by later cells.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("Set2")

# QTAU imports
# NOTE(review): in the cells visible here these two names are only used by the
# markdown "Production Code" example, not by any executable cell.
from qtau.pilot_compute_service import PilotComputeService, ExecutionEngine

# Printed so that a failing import is noticed before later cells are run.
print("Imports successful!")
print(f"Current time: {datetime.now()}")

In [None]:
# Configuration for Ray execution
RESOURCE_URL = "ssh://localhost"
# expanduser("~") resolves the home directory even when $HOME is not set
# (e.g. on Windows or in stripped-down job environments), where
# os.environ["HOME"] would raise KeyError. When $HOME is set the resulting
# path is the same as before.
WORKING_DIRECTORY = os.path.join(os.path.expanduser("~"), "work", "qtau_ray")

# Create working directory (no error if it already exists)
os.makedirs(WORKING_DIRECTORY, exist_ok=True)

# Ray pilot configuration
# This is the description passed to `pcs.create_pilot(...)` in the
# production example further down.
pilot_compute_description_ray = {
    "resource": RESOURCE_URL,
    "working_directory": WORKING_DIRECTORY,
    "type": "ray",
    "number_of_nodes": 2,
    "cores_per_node": 8,
    "gpus_per_node": 0,  # Set to number of GPUs if available
}

# Ray with SLURM configuration (for HPC clusters)
# Not used by any cell below; kept as a template.
pilot_compute_description_ray_slurm = {
    "resource": "slurm://localhost",
    "working_directory": WORKING_DIRECTORY,
    "type": "ray",
    "number_of_nodes": 4,
    "cores_per_node": 64,
    "gpus_per_node": 4,
    "queue": "regular",
    "walltime": 30,  # NOTE(review): presumably minutes - confirm against QTAU docs
    "project": "myproject",
    # Extra lines placed verbatim into the generated batch script.
    # NOTE(review): inferred from the key name - confirm against QTAU docs.
    "scheduler_script_commands": [
        "#SBATCH --constraint=gpu",
        "#SBATCH --gpus-per-task=1"
    ]
}

# Echo the active (non-SLURM) configuration for the notebook reader.
print("Ray Configuration:")
for key, value in pilot_compute_description_ray.items():
    print(f"  {key}: {value}")

### Starting QTAU with Ray (Production Code)

```python
# Initialize PilotComputeService with Ray engine
pcs = PilotComputeService(
    execution_engine=ExecutionEngine.RAY,
    working_directory=WORKING_DIRECTORY
)

# Create pilot
pilot = pcs.create_pilot(pilot_compute_description=pilot_compute_description_ray)
pilot.wait()

# Submit tasks with resource requirements
task = pcs.submit_task(
    my_function, 
    arg1, arg2,
    resources={'num_cpus': 1, 'num_gpus': 0, 'memory': None}
)

# Get results
pcs.wait_tasks([task])
result = pcs.get_results([task])
```

## 2. Basic Ray Task Distribution <a id="basic"></a>

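This section demonstrates basic task workloads of the kind you would submit to Ray. The cells below run the workloads locally in the notebook kernel; to distribute them, pass the same functions to `pcs.submit_task(...)` as shown in the production example above.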

In [None]:
# Define computational tasks
def monte_carlo_pi(n_samples, rng=None):
    """Estimate Pi using Monte Carlo method.

    Draws ``n_samples`` points uniformly from the square [-1, 1] x [-1, 1]
    and returns four times the fraction that falls inside the unit circle.

    Args:
        n_samples: Number of random points to draw. Must be positive.
        rng: Optional ``numpy.random.Generator`` to draw from. When omitted,
            NumPy's global random state is used, so ``np.random.seed(...)``
            keeps controlling the result exactly as before. Passing a
            dedicated generator gives each distributed task its own stream.

    Returns:
        The estimate of Pi as a NumPy float.

    Raises:
        ValueError: If ``n_samples`` is not positive (the estimate would
            otherwise be a division by zero).
    """
    if n_samples <= 0:
        raise ValueError(f"n_samples must be positive, got {n_samples}")

    # Both the legacy module-level API and Generator expose
    # uniform(low, high, size), so either can serve as the sampler.
    sampler = np.random if rng is None else rng
    x = sampler.uniform(-1, 1, n_samples)
    y = sampler.uniform(-1, 1, n_samples)
    inside_circle = np.sum(x**2 + y**2 <= 1)
    return 4 * inside_circle / n_samples

def numerical_integration(func, a, b, n_points):
    """Numerical integration using trapezoidal rule.

    Args:
        func: Callable evaluated on a NumPy array of sample points; it must
            return an array of the same length.
        a: Lower integration bound.
        b: Upper integration bound.
        n_points: Number of evenly spaced sample points, endpoints included.

    Returns:
        The trapezoidal-rule approximation of the integral of ``func`` over
        ``[a, b]``.
    """
    x = np.linspace(a, b, n_points)
    y = func(x)
    # NumPy 2.0 renamed trapz to trapezoid and deprecated the old name.
    # Prefer the new function and fall back only on older NumPy releases.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    return trapezoid(y, x)

def simulate_random_walk(n_steps, n_walkers):
    """Run ``n_walkers`` independent +/-1 random walks of ``n_steps`` steps.

    Returns a dict with the final position of every walker (list of ints),
    the largest absolute displacement reached by any walker at any step,
    and the mean squared final displacement.
    """
    # One row per walker, one column per step; each entry is -1 or +1.
    steps = np.random.choice([-1, 1], size=(n_walkers, n_steps))
    trajectories = np.cumsum(steps, axis=1)
    endpoints = trajectories[:, -1]

    summary = {}
    summary['final_positions'] = endpoints.tolist()
    summary['max_displacement'] = float(np.max(np.abs(trajectories)))
    summary['mean_squared_displacement'] = float(np.mean(endpoints**2))
    return summary

In [None]:
# Monte Carlo Pi estimation with increasing samples
# For every sample size, run `n_trials` independent estimates and record the
# mean estimate, its spread, its absolute error and the wall-clock throughput.
np.random.seed(42)
sample_sizes = [1000, 10000, 100000, 500000, 1000000, 5000000]
n_trials = 10

pi_results = []
for n_samples in sample_sizes:
    estimates = []
    times = []
    for _ in range(n_trials):
        # perf_counter() is monotonic and high-resolution. time.time() can
        # report 0 elapsed for the smallest sample sizes on coarse clocks,
        # which would turn the throughput division below into a divide by zero.
        start = time.perf_counter()
        pi_est = monte_carlo_pi(n_samples)
        elapsed = time.perf_counter() - start
        estimates.append(pi_est)
        times.append(elapsed)

    # Computed once and reused by several columns below.
    mean_estimate = np.mean(estimates)
    mean_time = np.mean(times)

    pi_results.append({
        'n_samples': n_samples,
        'pi_estimate': mean_estimate,
        # NOTE: despite the key name this is the standard deviation of the
        # per-trial estimates, not the standard error of their mean.
        'std_error': np.std(estimates),
        'error': abs(mean_estimate - np.pi),
        'mean_time': mean_time,
        'samples_per_sec': n_samples / mean_time
    })

pi_df = pd.DataFrame(pi_results)
pi_df

In [None]:
# Visualize Monte Carlo Pi estimation
# Draws a 2x2 figure from `pi_df` (built in the previous cell), saves it as
# 'ray_monte_carlo_pi.png' in the current directory and shows it inline.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Pi estimate convergence
# Error bars use the 'std_error' column, i.e. the spread across trials.
ax1 = axes[0, 0]
ax1.errorbar(pi_df['n_samples'], pi_df['pi_estimate'], yerr=pi_df['std_error'],
             fmt='o-', capsize=5, linewidth=2, markersize=8, color='steelblue')
ax1.axhline(np.pi, color='red', linestyle='--', label=f'True π = {np.pi:.6f}')
ax1.set_xscale('log')
ax1.set_xlabel('Number of Samples')
ax1.set_ylabel('Estimated π')
ax1.set_title('Monte Carlo π Estimation Convergence')
ax1.legend()

# 2. Error vs samples (log-log)
ax2 = axes[0, 1]
ax2.loglog(pi_df['n_samples'], pi_df['error'], 'go-', linewidth=2, markersize=8, label='Actual Error')
# Theoretical 1/sqrt(n) convergence
# The reference curve is scaled so that it passes through the first measured
# error; only its slope is meaningful, not its absolute level.
n_ref = np.array(sample_sizes)
theoretical_error = 1.0 / np.sqrt(n_ref) * pi_df['error'].iloc[0] * np.sqrt(sample_sizes[0])
ax2.loglog(n_ref, theoretical_error, 'r--', alpha=0.7, label='1/√n theoretical')
ax2.set_xlabel('Number of Samples')
ax2.set_ylabel('Absolute Error')
ax2.set_title('Error Convergence Rate')
ax2.legend()
ax2.grid(True, which="both", ls="-", alpha=0.2)

# 3. Throughput
# Bars are placed at categorical positions 0..len-1; the tick labels render
# each sample size as 'K' (thousands) or 'M' (millions).
ax3 = axes[1, 0]
ax3.bar(range(len(pi_df)), pi_df['samples_per_sec'] / 1e6, color='coral')
ax3.set_xticks(range(len(pi_df)))
ax3.set_xticklabels([f'{n/1e6:.1f}M' if n >= 1e6 else f'{n/1e3:.0f}K' 
                     for n in pi_df['n_samples']], rotation=45)
ax3.set_xlabel('Sample Size')
ax3.set_ylabel('Throughput (Million samples/sec)')
ax3.set_title('Computational Throughput')

# 4. Execution time
# semilogy + set_xscale('log') together give a log-log plot.
ax4 = axes[1, 1]
ax4.semilogy(pi_df['n_samples'], pi_df['mean_time'], 'mo-', linewidth=2, markersize=8)
ax4.set_xscale('log')
ax4.set_xlabel('Number of Samples')
ax4.set_ylabel('Execution Time (seconds)')
ax4.set_title('Execution Time Scaling')
ax4.grid(True, which="both", ls="-", alpha=0.2)

plt.tight_layout()
plt.savefig('ray_monte_carlo_pi.png', dpi=150, bbox_inches='tight')
plt.show()

# "Best" here means the last row (largest sample size), which is not
# necessarily the row with the smallest error.
print(f"\nBest π estimate: {pi_df.iloc[-1]['pi_estimate']:.6f} (error: {pi_df.iloc[-1]['error']:.6f})")

## 3. Resource-Aware Task Scheduling <a id="resources"></a>

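Ray allows fine-grained resource allocation per task. The cells below do not contact a Ray cluster; they generate a synthetic schedule of 50 tasks with differing CPU, memory, and GPU requirements to illustrate the idea.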

In [None]:
# Simulate resource-aware task scheduling
# Different tasks with different resource requirements

# Catalogue of task profiles. Only the GPU profile carries a 'gpus' key; the
# other profiles fall back to 0 GPUs when the rows are built below.
task_types = [
    {'name': 'light_cpu', 'cpus': 1, 'memory_gb': 1, 'duration': 0.5},
    {'name': 'heavy_cpu', 'cpus': 4, 'memory_gb': 2, 'duration': 2.0},
    {'name': 'memory_intensive', 'cpus': 2, 'memory_gb': 8, 'duration': 1.5},
    {'name': 'gpu_task', 'cpus': 1, 'gpus': 1, 'memory_gb': 4, 'duration': 1.0},
]

# Simulate scheduling on a cluster with 16 CPUs, 32GB RAM, 2 GPUs
cluster_resources = {'cpus': 16, 'memory_gb': 32, 'gpus': 2}

# Generate task schedule
np.random.seed(42)
n_tasks = 50

scheduled_tasks = []
current_time = 0
resource_timeline = []

for task_index in range(n_tasks):
    # The three random draws must stay in this order (profile, duration
    # jitter, arrival gap) so that the seeded schedule is reproduced.
    profile = task_types[np.random.randint(0, len(task_types))]

    # Duration varies by up to +/-20% around the profile's nominal value.
    jitter = 0.8 + 0.4 * np.random.random()

    # Start times form a running sum of exponential gaps.
    current_time = current_time + np.random.exponential(0.2)

    scheduled_tasks.append(dict(
        task_id=f'task_{task_index}',
        type=profile['name'],
        cpus=profile['cpus'],
        memory_gb=profile.get('memory_gb', 0),
        gpus=profile.get('gpus', 0),
        duration=profile['duration'] * jitter,
        start_time=current_time,
    ))

tasks_df = pd.DataFrame(scheduled_tasks).assign(
    end_time=lambda frame: frame['start_time'] + frame['duration']
)
tasks_df.head(10)

In [None]:
# Visualize resource-aware scheduling
# Draws a 2x2 figure from `tasks_df`, saves it as 'ray_resource_scheduling.png'
# in the current directory, shows it inline and prints resource totals.
from matplotlib.patches import Patch

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Task type distribution
ax1 = axes[0, 0]
type_counts = tasks_df['type'].value_counts()
colors = plt.cm.Set2(range(len(type_counts)))
ax1.pie(type_counts.values, labels=type_counts.index, autopct='%1.1f%%',
        colors=colors, explode=[0.02]*len(type_counts))
ax1.set_title('Task Type Distribution')

# 2. Resource requirements by task type
ax2 = axes[0, 1]
resource_summary = tasks_df.groupby('type').agg({
    'cpus': 'mean',
    'memory_gb': 'mean',
    'gpus': 'mean',
    'duration': 'mean'
}).reset_index()

x = np.arange(len(resource_summary))
width = 0.25
ax2.bar(x - width, resource_summary['cpus'], width, label='CPUs', color='steelblue')
ax2.bar(x, resource_summary['memory_gb'], width, label='Memory (GB)', color='coral')
# GPU counts are scaled by 4 purely so the bars are visible next to the
# CPU/memory bars; the legend label states the scaling.
ax2.bar(x + width, resource_summary['gpus'] * 4, width, label='GPUs (×4)', color='green')
ax2.set_xticks(x)
ax2.set_xticklabels(resource_summary['type'], rotation=45, ha='right')
ax2.set_ylabel('Resource Units')
ax2.set_title('Average Resource Requirements by Task Type')
ax2.legend()

# 3. Task execution timeline (Gantt chart)
ax3 = axes[1, 0]
type_colors = {'light_cpu': 'lightblue', 'heavy_cpu': 'steelblue',
               'memory_intensive': 'coral', 'gpu_task': 'lightgreen'}

# Show at most the first 20 tasks for clarity. Tick positions, tick labels
# and the title are all derived from the rows actually plotted, so the chart
# stays valid when fewer than 20 tasks exist (a fixed range(20) would not
# match the number of labels in that case).
gantt_df = tasks_df.head(20)
n_shown = len(gantt_df)
for row_pos, (_, task) in enumerate(gantt_df.iterrows()):
    ax3.barh(row_pos, task['duration'], left=task['start_time'],
             color=type_colors[task['type']], edgecolor='black', linewidth=0.5)

ax3.set_yticks(range(n_shown))
# Labels are the first six characters of each task's type name.
ax3.set_yticklabels([type_name[:6] for type_name in gantt_df['type']], fontsize=8)
ax3.set_xlabel('Time (seconds)')
ax3.set_ylabel('Task')
ax3.set_title(f'Task Execution Timeline (First {n_shown} Tasks)')

# Add legend
legend_elements = [Patch(facecolor=c, label=t) for t, c in type_colors.items()]
ax3.legend(handles=legend_elements, loc='lower right', fontsize=8)

# 4. Duration distribution by type
ax4 = axes[1, 1]
tasks_df.boxplot(column='duration', by='type', ax=ax4)
ax4.set_xlabel('Task Type')
ax4.set_ylabel('Duration (seconds)')
ax4.set_title('Task Duration Distribution by Type')
plt.suptitle('')  # Remove automatic title

plt.tight_layout()
plt.savefig('ray_resource_scheduling.png', dpi=150, bbox_inches='tight')
plt.show()

# Totals are resource amount multiplied by duration, summed over all tasks.
print("\nResource Summary:")
print(f"Total CPU-seconds: {(tasks_df['cpus'] * tasks_df['duration']).sum():.1f}")
print(f"Total GPU-seconds: {(tasks_df['gpus'] * tasks_df['duration']).sum():.1f}")
print(f"Total memory-GB-seconds: {(tasks_df['memory_gb'] * tasks_df['duration']).sum():.1f}")

## 4. GPU Task Execution <a id="gpu"></a>

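Ray with QTAU enables easy GPU task distribution. The comparison below uses a simulated speedup model rather than measured GPU timings, so no GPU is required to run it.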

In [None]:
# Simulate GPU vs CPU performance comparison
# In practice, you would use actual GPU computations

def simulate_gpu_speedup(operation, size):
    """Return a synthetic GPU-over-CPU speedup factor.

    The model is ``base + slope * log10(size / 100)`` with per-operation
    coefficients. Unknown operations get a flat factor of 10, and the result
    is never smaller than 1.
    """
    # Number of decades the problem size lies above the reference size 100.
    decades = np.log10(size/100)

    # operation -> (base speedup at size 100, gain per decade of size)
    coefficients = {
        'matrix_multiply': (20, 5),
        'fft': (15, 3),
        'neural_network': (50, 10),
        'monte_carlo': (30, 8),
    }

    if operation in coefficients:
        base, slope = coefficients[operation]
        factor = base + slope * decades
    else:
        factor = 10
    return max(1, factor)

# Generate comparison data
operations = ['matrix_multiply', 'fft', 'neural_network', 'monte_carlo']
sizes = [100, 500, 1000, 2000, 5000, 10000]


def _gpu_comparison_row(op, size):
    """Build one table row: modelled CPU time, GPU time and speedup."""
    cpu_time = size**2 / 1e8  # Simplified CPU time model
    speedup = simulate_gpu_speedup(op, size)
    return {
        'operation': op,
        'size': size,
        'cpu_time': cpu_time,
        'gpu_time': cpu_time / speedup,
        'speedup': speedup
    }


# Rows are ordered operation-major, size-minor.
gpu_comparison = [_gpu_comparison_row(op, size)
                  for op in operations
                  for size in sizes]

gpu_df = pd.DataFrame(gpu_comparison)
gpu_df.head(12)

In [None]:
# Visualize GPU performance
# Draws a 2x2 figure from `gpu_df`, saves it as 'ray_gpu_performance.png' in
# the current directory, shows it inline and prints summary statistics.
# Side effect: adds 'time_saved' and 'pct_saved' columns to `gpu_df`.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Speedup by operation and size
ax1 = axes[0, 0]
for op in operations:
    op_data = gpu_df[gpu_df['operation'] == op]
    ax1.plot(op_data['size'], op_data['speedup'], 'o-', label=op.replace('_', ' ').title(),
             linewidth=2, markersize=8)
ax1.set_xlabel('Problem Size')
ax1.set_ylabel('GPU Speedup (×)')
ax1.set_title('GPU Speedup by Operation Type')
ax1.legend()
ax1.set_xscale('log')

# 2. CPU vs GPU time comparison
# Only the 'matrix_multiply' column of each pivot table is plotted.
ax2 = axes[0, 1]
pivot_cpu = gpu_df.pivot(index='size', columns='operation', values='cpu_time')
pivot_gpu = gpu_df.pivot(index='size', columns='operation', values='gpu_time')

# NOTE(review): bars follow the pivot index order while tick labels follow
# `sizes`; the two agree only because `sizes` is listed in ascending order.
x = np.arange(len(sizes))
width = 0.4
ax2.bar(x - width/2, pivot_cpu['matrix_multiply'], width, label='CPU', color='steelblue', alpha=0.8)
ax2.bar(x + width/2, pivot_gpu['matrix_multiply'], width, label='GPU', color='green', alpha=0.8)
ax2.set_xticks(x)
ax2.set_xticklabels(sizes)
ax2.set_xlabel('Problem Size')
ax2.set_ylabel('Execution Time (seconds)')
ax2.set_title('CPU vs GPU: Matrix Multiplication')
ax2.legend()
ax2.set_yscale('log')

# 3. Speedup heatmap
ax3 = axes[1, 0]
pivot_speedup = gpu_df.pivot(index='operation', columns='size', values='speedup')
sns.heatmap(pivot_speedup, annot=True, fmt='.1f', cmap='YlGnBu', ax=ax3,
            cbar_kws={'label': 'Speedup (×)'})
ax3.set_xlabel('Problem Size')
ax3.set_ylabel('Operation')
ax3.set_title('GPU Speedup Heatmap')

# 4. Time savings
# pct_saved is (cpu_time - gpu_time) / cpu_time expressed in percent.
ax4 = axes[1, 1]
gpu_df['time_saved'] = gpu_df['cpu_time'] - gpu_df['gpu_time']
gpu_df['pct_saved'] = (gpu_df['time_saved'] / gpu_df['cpu_time']) * 100

for op in operations:
    op_data = gpu_df[gpu_df['operation'] == op]
    ax4.plot(op_data['size'], op_data['pct_saved'], 'o-', label=op.replace('_', ' ').title(),
             linewidth=2, markersize=8)

ax4.set_xlabel('Problem Size')
ax4.set_ylabel('Time Saved (%)')
ax4.set_title('Percentage of Time Saved with GPU')
ax4.legend()
ax4.set_xscale('log')
# The fixed 90-100% window hides any point whose speedup is below 10x.
ax4.set_ylim(90, 100)

plt.tight_layout()
plt.savefig('ray_gpu_performance.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nGPU Performance Summary:")
print(f"Average speedup: {gpu_df['speedup'].mean():.1f}×")
print(f"Max speedup: {gpu_df['speedup'].max():.1f}× ({gpu_df.loc[gpu_df['speedup'].idxmax(), 'operation']})")
print(f"Average time saved: {gpu_df['pct_saved'].mean():.1f}%")

## 5. Throughput Scaling Analysis <a id="throughput"></a>

Analyze how throughput scales with cluster size and task count.

In [None]:
# Simulate throughput benchmark data
# Similar to pilot_ray_slurm_throughput.py

# Seed the global RNG so the noise term, and therefore every figure derived
# from this table, is reproducible when the cell is re-run on its own. This
# matches the seeded simulation cells earlier in the notebook.
np.random.seed(42)

node_configs = [1, 2, 4, 8]
cores_per_node = 64
task_counts = [256, 512, 1024, 2048, 4096, 8192]

# Cost model constant, identical for every configuration.
task_time = 0.001  # 1ms per task

throughput_results = []

for nodes in node_configs:
    total_cores = nodes * cores_per_node
    # Communication overhead grows linearly with the node count and does not
    # depend on the task count, so it is computed once per node configuration.
    communication_overhead = 0.05 * nodes

    for n_tasks in task_counts:
        # Scheduling overhead grows logarithmically with the task count.
        scheduling_overhead = 0.1 * np.log(n_tasks)

        # Parallel execution time: tasks run in waves of `total_cores`.
        parallel_batches = np.ceil(n_tasks / total_cores)
        execution_time = parallel_batches * task_time + scheduling_overhead + communication_overhead

        # Add some noise (uniform factor in [0.9, 1.1))
        execution_time *= (0.9 + 0.2 * np.random.random())

        throughput = n_tasks / execution_time
        # Per-core throughput in tasks/second/core (not normalized to 1).
        efficiency = throughput / total_cores

        throughput_results.append({
            'nodes': nodes,
            'total_cores': total_cores,
            'n_tasks': n_tasks,
            'execution_time': execution_time,
            'throughput': throughput,
            'efficiency': efficiency,
            'tasks_per_core': n_tasks / total_cores
        })

throughput_df = pd.DataFrame(throughput_results)
throughput_df.head(15)

In [None]:
# Visualize throughput scaling
# Draws a 2x2 figure from `throughput_df`, saves it as
# 'ray_throughput_scaling.png' in the current directory, shows it inline and
# prints a short summary.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Throughput vs task count by node count
ax1 = axes[0, 0]
for nodes in node_configs:
    data = throughput_df[throughput_df['nodes'] == nodes]
    # Take the core count from the data instead of hard-coding 64 cores per
    # node, so the legend stays correct if the simulation settings change.
    cores = int(data['total_cores'].iloc[0])
    ax1.plot(data['n_tasks'], data['throughput'], 'o-',
             label=f'{nodes} nodes ({cores} cores)', linewidth=2, markersize=8)
ax1.set_xlabel('Number of Tasks')
ax1.set_ylabel('Throughput (tasks/second)')
ax1.set_title('Throughput Scaling with Task Count')
ax1.legend()
ax1.set_xscale('log', base=2)
ax1.set_yscale('log')

# 2. Strong scaling (fixed problem, more resources)
ax2 = axes[0, 1]
fixed_tasks = 4096
strong_data = throughput_df[throughput_df['n_tasks'] == fixed_tasks]
ax2.plot(strong_data['total_cores'], strong_data['throughput'], 'go-',
         linewidth=2, markersize=10, label='Actual')

# Ideal scaling line: throughput of the first configuration scaled linearly
# with the core count.
ideal_scaling = strong_data['throughput'].iloc[0] * strong_data['total_cores'] / strong_data['total_cores'].iloc[0]
ax2.plot(strong_data['total_cores'], ideal_scaling, 'r--', linewidth=2, label='Ideal Linear')

ax2.set_xlabel('Total Cores')
ax2.set_ylabel('Throughput (tasks/second)')
ax2.set_title(f'Strong Scaling ({fixed_tasks} tasks)')
ax2.legend()

# 3. Weak scaling (tasks proportional to resources)
ax3 = axes[1, 0]
# Select entries where tasks_per_core is approximately constant; fall back to
# the fixed-task-count rows when no such entries exist.
weak_data = throughput_df[throughput_df['tasks_per_core'].between(15, 17)]
efficiency_data = weak_data if len(weak_data) > 0 else strong_data
efficiency_data = efficiency_data.sort_values('total_cores')
if not efficiency_data.empty:
    # The 'efficiency' column is raw per-core throughput (tasks/s/core), so
    # comparing it with a fixed "ideal = 1.0" line is meaningless. Normalize
    # by the smallest configuration: 1.0 then means "per-core throughput is
    # as good as on the smallest cluster", which is what the red line marks.
    relative_efficiency = efficiency_data['efficiency'] / efficiency_data['efficiency'].iloc[0]
    ax3.plot(efficiency_data['total_cores'], relative_efficiency, 'bo-',
             linewidth=2, markersize=10)
ax3.axhline(1.0, color='red', linestyle='--', alpha=0.7, label='Ideal efficiency')
ax3.set_xlabel('Total Cores')
ax3.set_ylabel('Relative efficiency (per-core throughput vs. smallest config)')
ax3.set_title('Parallel Efficiency')
ax3.legend()

# 4. Execution time heatmap
ax4 = axes[1, 1]
pivot_time = throughput_df.pivot(index='nodes', columns='n_tasks', values='execution_time')
sns.heatmap(pivot_time, annot=True, fmt='.2f', cmap='RdYlGn_r', ax=ax4,
            cbar_kws={'label': 'Execution Time (s)'})
ax4.set_xlabel('Number of Tasks')
ax4.set_ylabel('Number of Nodes')
ax4.set_title('Execution Time (seconds)')

plt.tight_layout()
plt.savefig('ray_throughput_scaling.png', dpi=150, bbox_inches='tight')
plt.show()

# Look the best row up once and reuse it for both printed fields.
best_row = throughput_df.loc[throughput_df['throughput'].idxmax()]
print("\nScaling Summary:")
print(f"Max throughput: {throughput_df['throughput'].max():.0f} tasks/second")
print(f"Best config: {throughput_df.loc[throughput_df['throughput'].idxmax(), 'nodes']} nodes, "
      f"{throughput_df.loc[throughput_df['throughput'].idxmax(), 'n_tasks']} tasks")
print(f"Average efficiency: {throughput_df['efficiency'].mean():.2f} tasks/second/core")

## 6. Actor Patterns <a id="actors"></a>

Ray supports actor-based computation patterns for stateful distributed computing.

In [None]:
# Simulate actor-based parameter server pattern
# Common in distributed machine learning

class SimulatedParameterServer:
    """Simulates a parameter server actor.

    Holds a flat parameter vector, applies gradient-descent style updates to
    it, and records the parameter norm after every update.
    """
    def __init__(self, n_params):
        """Create a server with ``n_params`` standard-normal parameters.

        The initial values are drawn from NumPy's global random state.
        """
        # Current parameter vector, shape (n_params,).
        self.params = np.random.randn(n_params)
        # Number of successful apply_gradients() calls so far.
        self.update_count = 0
        # Parameter norm recorded AFTER each update, so history[0] is the
        # norm after the first update, not the initial norm.
        self.history = []

    def get_params(self):
        """Return a copy of the parameters so callers cannot mutate state."""
        return self.params.copy()

    def apply_gradients(self, gradients, learning_rate=0.01):
        """Apply one update ``params -= learning_rate * gradients``.

        Args:
            gradients: Array-like with the same shape as the parameter vector.
            learning_rate: Step size applied to the gradients.

        Returns:
            A copy of the updated parameter vector.

        Raises:
            ValueError: If ``gradients`` does not match the parameter shape.
                Without this check a scalar or length-1 gradient would be
                silently broadcast across all parameters.
        """
        gradients = np.asarray(gradients, dtype=float)
        if gradients.shape != self.params.shape:
            raise ValueError(
                f"gradients shape {gradients.shape} does not match "
                f"parameter shape {self.params.shape}"
            )
        self.params -= learning_rate * gradients
        self.update_count += 1
        self.history.append(np.linalg.norm(self.params))
        return self.params.copy()

def simulate_worker_training(worker_id, n_iterations, batch_size, n_params=100):
    """Simulate worker computing gradients.

    For every iteration a random gradient vector and a random loss are drawn
    from NumPy's global random state. The loss is exponentially distributed
    with scale ``1 / (iteration + 1)``, so its expected value shrinks as the
    iterations progress.

    Args:
        worker_id: Identifier copied into every result row.
        n_iterations: Number of simulated training iterations.
        batch_size: Accepted for interface compatibility; the simulation
            does not use it.
        n_params: Length of the simulated gradient vector. Defaults to 100,
            the value that was previously hard-coded.

    Returns:
        A list with one dict per iteration holding the keys ``worker_id``,
        ``iteration``, ``loss`` and ``gradient_norm``.
    """
    results = []
    for i in range(n_iterations):
        # Simulate gradient computation. The gradient is drawn before the
        # loss; this draw order determines the seeded results.
        gradient = np.random.randn(n_params) * 0.1
        loss = np.random.exponential(1.0 / (i + 1))
        results.append({
            'worker_id': worker_id,
            'iteration': i,
            'loss': loss,
            'gradient_norm': np.linalg.norm(gradient)
        })
    return results

In [None]:
# Simulate distributed training
# Runs the workers one after another in this process and collects their
# per-iteration records into `training_df`.
n_workers = 4
n_iterations = 50
batch_size = 32

# Run workers
all_results = []
for worker_id in range(n_workers):
    # Re-seed per worker so each worker's trace is reproducible on its own.
    np.random.seed(worker_id * 42)
    results = simulate_worker_training(worker_id, n_iterations, batch_size)
    all_results.extend(results)

training_df = pd.DataFrame(all_results)

# Simulate parameter server
# The server receives freshly drawn random gradients here, not the gradients
# produced by the workers above; one update is applied per worker-iteration.
ps = SimulatedParameterServer(100)
for _ in range(n_iterations * n_workers):
    gradients = np.random.randn(100) * 0.1
    ps.apply_gradients(gradients)

training_df.head(10)

In [None]:
# Visualize distributed training
# Draws a 2x2 figure from `training_df` and the parameter server `ps`, saves
# it as 'ray_actor_training.png' in the current directory, shows it inline
# and prints a short summary.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Loss curves per worker
ax1 = axes[0, 0]
for worker_id in range(n_workers):
    worker_data = training_df[training_df['worker_id'] == worker_id]
    ax1.plot(worker_data['iteration'], worker_data['loss'],
             label=f'Worker {worker_id}', alpha=0.7, linewidth=2)

# Average loss across workers at each iteration
avg_loss = training_df.groupby('iteration')['loss'].mean()
ax1.plot(avg_loss.index, avg_loss.values, 'k-', linewidth=3, label='Average')

ax1.set_xlabel('Iteration')
ax1.set_ylabel('Loss')
ax1.set_title('Training Loss by Worker')
ax1.legend()
ax1.set_yscale('log')

# 2. Parameter norm evolution
ax2 = axes[0, 1]
ax2.plot(ps.history, 'b-', linewidth=2)
ax2.set_xlabel('Update Step')
ax2.set_ylabel('Parameter Norm')
ax2.set_title('Parameter Server: Parameter Norm Evolution')
# The server appends to `history` only after applying an update, so
# history[0] is the norm after the first update, not the initial norm.
# The reference line is labelled accordingly.
ax2.axhline(ps.history[0], color='red', linestyle='--', alpha=0.5, label='After first update')
ax2.legend()

# 3. Gradient norm distribution
ax3 = axes[1, 0]
training_df.boxplot(column='gradient_norm', by='worker_id', ax=ax3)
ax3.set_xlabel('Worker ID')
ax3.set_ylabel('Gradient Norm')
ax3.set_title('Gradient Norm Distribution by Worker')
plt.suptitle('')  # Remove the automatic "Boxplot grouped by ..." title

# 4. Work distribution pie chart
# Each slice is the number of recorded iterations for that worker.
ax4 = axes[1, 1]
work_per_worker = training_df.groupby('worker_id').size()
ax4.pie(work_per_worker.values, labels=[f'Worker {i}' for i in work_per_worker.index],
        autopct='%1.1f%%', colors=plt.cm.Set2(range(n_workers)))
ax4.set_title('Work Distribution Across Workers')

plt.tight_layout()
plt.savefig('ray_actor_training.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nDistributed Training Summary:")
# "Total iterations" counts rows, i.e. iterations summed over all workers.
print(f"Total iterations: {len(training_df)}")
print(f"Workers: {n_workers}")
print(f"Final average loss: {training_df.groupby('iteration')['loss'].mean().iloc[-1]:.4f}")
print(f"Parameter updates: {ps.update_count}")

## Summary

This notebook demonstrated using QTAU with Ray for:

1. **Basic task distribution** with Monte Carlo simulations
2. **Resource-aware scheduling** with CPU, memory, and GPU requirements
3. **GPU task execution** comparing CPU vs GPU performance
4. **Throughput scaling analysis** with strong and weak scaling
5. **Actor patterns** for stateful distributed computing

### Key Takeaways

- Ray excels at low-latency task execution and GPU workloads
- Resource specifications allow fine-grained control over task placement
- Actor patterns enable stateful distributed applications
- QTAU simplifies Ray cluster management on HPC systems