# EpiRust High-Performance Computing Demo

This notebook demonstrates EpiRust's high-performance computing capabilities:

1. Parallel processing with Rayon
2. SIMD optimizations
3. Memory optimization techniques
4. NUMA-aware computations

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from time import perf_counter
from epirust.parallel import ParallelExecutor
from epirust.compute.simd import SimdOperations
from epirust.memory import MemoryOptimizer

# Set random seed and plotting style
# A fixed seed makes the randomly generated benchmark inputs reproducible.
np.random.seed(42)
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the bare 'seaborn' alias, so try the current
# name first and fall back to the legacy one on older installations.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
sns.set_palette("husl")

## 1. Parallel Processing with Rayon

Let's compare parallel vs sequential processing for large-scale computations:

In [None]:
def benchmark_parallel_processing(sizes):
    """Time sequential against parallel execution for each requested size.

    For every entry in ``sizes`` a random ``(size, 100)`` array is generated
    and handed first to ``process_sequential`` and then to
    ``process_parallel`` on a single ``ParallelExecutor`` instance. Each call
    is timed with ``perf_counter``.

    Args:
        sizes: Iterable of row counts to benchmark.

    Returns:
        A DataFrame with one row per size and the columns ``size``,
        ``sequential_time``, ``parallel_time`` and ``speedup`` (sequential
        time divided by parallel time).
    """
    runner = ParallelExecutor()
    rows = []

    for n_rows in sizes:
        # Fresh random input for this size
        sample = np.random.random((n_rows, 100))

        # Sequential pass; the output is kept only so the call mirrors real use
        tic = perf_counter()
        sequential_output = runner.process_sequential(sample)
        elapsed_sequential = perf_counter() - tic

        # Parallel pass over the very same input
        tic = perf_counter()
        parallel_output = runner.process_parallel(sample)
        elapsed_parallel = perf_counter() - tic

        rows.append(dict(
            size=n_rows,
            sequential_time=elapsed_sequential,
            parallel_time=elapsed_parallel,
            speedup=elapsed_sequential / elapsed_parallel,
        ))

    return pd.DataFrame(rows)

# Run benchmarks
sizes = [1000, 10000, 100000, 1000000]
parallel_results = benchmark_parallel_processing(sizes)

# Plot results
plt.figure(figsize=(12, 5))

# Left panel: absolute timings on log-log axes, one curve per execution mode
plt.subplot(1, 2, 1)
for column, marker, label in (
    ('sequential_time', 'o', 'Sequential'),
    ('parallel_time', 's', 'Parallel'),
):
    plt.loglog(parallel_results['size'], parallel_results[column],
               marker=marker, label=label)
plt.xlabel('Data Size')
plt.ylabel('Time (seconds)')
plt.title('Processing Time vs Data Size')
plt.legend()
plt.grid(True, alpha=0.3)

# Right panel: sequential/parallel time ratio against data size
plt.subplot(1, 2, 2)
plt.semilogx(
    parallel_results['size'],
    parallel_results['speedup'],
    marker='o',
)
plt.xlabel('Data Size')
plt.ylabel('Speedup Factor')
plt.title('Parallel Processing Speedup')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 2. SIMD Optimizations

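Compare the time taken by `vector_sum` across SIMD implementations (scalar, SSE2, AVX2, AVX-512); only the implementations for which `has_capability` returns true are benchmarked: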

In [None]:
def benchmark_simd_operations(sizes):
    """Time ``vector_sum`` for every available implementation at each size.

    For each entry in ``sizes`` a random 1-D array of that length is
    generated. Each of the implementations ``scalar``, ``sse2``, ``avx2`` and
    ``avx512`` is timed with ``perf_counter``, but only when
    ``SimdOperations.has_capability`` accepts it.

    Args:
        sizes: Iterable of vector lengths to benchmark.

    Returns:
        A DataFrame with one row per size: a ``size`` column plus one timing
        column (seconds) per implementation that was run.
    """
    ops = SimdOperations()
    rows = []

    for length in sizes:
        vector = np.random.random(length)

        # Start the row with the size; timings are appended as they are taken
        row = {'size': length}
        for variant in ('scalar', 'sse2', 'avx2', 'avx512'):
            if not ops.has_capability(variant):
                continue  # implementation unavailable: leave its column out
            tic = perf_counter()
            _ = ops.vector_sum(vector, variant)
            row[variant] = perf_counter() - tic

        rows.append(row)

    return pd.DataFrame(rows)

# Run SIMD benchmarks
sizes = [1000, 10000, 100000, 1000000]
simd_results = benchmark_simd_operations(sizes)

# Plot SIMD performance comparison
plt.figure(figsize=(10, 6))

# Every column other than 'size' holds the timings of one implementation
timing_columns = [name for name in simd_results.columns if name != 'size']
for col in timing_columns:
    plt.loglog(
        simd_results['size'],
        simd_results[col],
        marker='o',
        label=col.upper(),
    )

plt.xlabel('Vector Size')
plt.ylabel('Time (seconds)')
plt.title('SIMD Implementation Performance')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## 3. Memory Optimization

Demonstrate memory-efficient processing techniques:

In [None]:
def benchmark_memory_optimization(sizes):
    """Compare standard and optimized processing in time and memory.

    For each entry in ``sizes`` a random ``(size, 50)`` array is processed
    with ``process_standard`` and then ``process_optimized`` on a single
    ``MemoryOptimizer``. Each call is timed with ``perf_counter`` and is
    followed immediately by a ``measure_memory_usage()`` reading.

    Args:
        sizes: Iterable of row counts to benchmark.

    Returns:
        A DataFrame with one row per size and the columns ``size``,
        ``standard_time``, ``optimized_time``, ``standard_memory`` and
        ``optimized_memory``.
    """
    mem_optimizer = MemoryOptimizer()
    rows = []

    for n_rows in sizes:
        # Generate test data
        sample = np.random.random((n_rows, 50))

        # Baseline run; memory is sampled right after the timer stops
        tic = perf_counter()
        standard_output = mem_optimizer.process_standard(sample)
        elapsed_standard = perf_counter() - tic
        memory_standard = mem_optimizer.measure_memory_usage()

        # Optimized run, measured the same way
        tic = perf_counter()
        optimized_output = mem_optimizer.process_optimized(sample)
        elapsed_optimized = perf_counter() - tic
        memory_optimized = mem_optimizer.measure_memory_usage()

        rows.append(dict(
            size=n_rows,
            standard_time=elapsed_standard,
            optimized_time=elapsed_optimized,
            standard_memory=memory_standard,
            optimized_memory=memory_optimized,
        ))

    return pd.DataFrame(rows)

# Run memory optimization benchmarks
sizes = [1000, 10000, 100000]
memory_results = benchmark_memory_optimization(sizes)

# Plot memory usage and performance
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# One entry per panel: axis, standard column, optimized column, y label, title
panels = (
    (ax1, 'standard_memory', 'optimized_memory',
     'Memory Usage (MB)', 'Memory Usage Comparison'),
    (ax2, 'standard_time', 'optimized_time',
     'Time (seconds)', 'Processing Time Comparison'),
)

for axis, standard_col, optimized_col, y_label, panel_title in panels:
    axis.loglog(memory_results['size'], memory_results[standard_col],
                marker='o', label='Standard')
    axis.loglog(memory_results['size'], memory_results[optimized_col],
                marker='s', label='Optimized')
    axis.set_xlabel('Data Size')
    axis.set_ylabel(y_label)
    axis.set_title(panel_title)
    axis.legend()
    axis.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. NUMA-Aware Processing

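Demonstrate NUMA-aware computations for multi-socket systems. The benchmark runs only when `ParallelExecutor.has_numa_support()` reports support; otherwise a notice is printed instead: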

In [None]:
def benchmark_numa_processing(sizes):
    """Time standard against NUMA-aware processing for each requested size.

    For every entry in ``sizes`` a random ``(size, 100)`` array is passed to
    ``ParallelExecutor.process_standard`` and then to
    ``ParallelExecutor.process_numa_aware``; both calls are timed with
    ``perf_counter``.

    NOTE(review): both methods are invoked on the ``ParallelExecutor`` class
    itself rather than on an instance; confirm they are class/static methods.

    Args:
        sizes: Iterable of row counts to benchmark.

    Returns:
        A DataFrame with one row per size and the columns ``size``,
        ``standard_time``, ``numa_time`` and ``speedup`` (standard time
        divided by NUMA-aware time). Empty when ``sizes`` is empty.
    """
    rows = []

    for n_rows in sizes:
        sample = np.random.random((n_rows, 100))

        # Baseline: standard parallel processing
        tic = perf_counter()
        _ = ParallelExecutor.process_standard(sample)
        elapsed_standard = perf_counter() - tic

        # Same input through the NUMA-aware path
        tic = perf_counter()
        _ = ParallelExecutor.process_numa_aware(sample)
        elapsed_numa = perf_counter() - tic

        rows.append(dict(
            size=n_rows,
            standard_time=elapsed_standard,
            numa_time=elapsed_numa,
            speedup=elapsed_standard / elapsed_numa,
        ))

    return pd.DataFrame(rows)

# Run NUMA benchmarks if system supports it
if not ParallelExecutor.has_numa_support():
    # Nothing to benchmark here; numa_results is deliberately left undefined
    print("NUMA support not detected on this system")
else:
    sizes = [10000, 100000, 1000000]
    numa_results = benchmark_numa_processing(sizes)

    # Plot NUMA performance comparison
    plt.figure(figsize=(10, 6))
    plt.semilogx(
        numa_results['size'],
        numa_results['speedup'],
        marker='o',
    )
    plt.xlabel('Data Size')
    plt.ylabel('Speedup Factor')
    plt.title('NUMA-Aware Processing Speedup')
    plt.grid(True, alpha=0.3)
    plt.show()

## Performance Summary

Let's summarize the performance gains from different optimizations:

In [None]:
# Collect maximum speedups

# SIMD speedup is taken from the largest benchmarked vector (last row).
# Speedup is baseline time divided by optimized time, i.e. scalar / SIMD,
# so a faster SIMD run yields a value above 1. The widest instruction set
# that was actually benchmarked is used, so a machine without AVX-512 still
# reports its AVX2 or SSE2 result instead of "Not Available".
simd_candidates = [impl for impl in ('avx512', 'avx2', 'sse2')
                   if impl in simd_results.columns]
if simd_candidates and 'scalar' in simd_results.columns:
    largest_run = simd_results.iloc[-1]
    simd_speedup = largest_run['scalar'] / largest_run[simd_candidates[0]]
else:
    simd_speedup = 'N/A'

summary = {
    'Parallel Processing': parallel_results['speedup'].max(),
    'SIMD (vs Scalar)': simd_speedup,
    # Ratio of memory readings (standard / optimized), not of run times
    'Memory Optimization': (memory_results['standard_memory'] /
                            memory_results['optimized_memory']).max(),
    # numa_results only exists when the NUMA benchmark cell actually ran
    'NUMA-Aware': numa_results['speedup'].max() if 'numa_results' in locals() else 'N/A'
}

print("Maximum Performance Improvements:")
for technique, speedup in summary.items():
    if speedup != 'N/A':
        print(f"{technique}: {speedup:.2f}x")
    else:
        print(f"{technique}: Not Available")

## Conclusion

This notebook demonstrated EpiRust's high-performance computing capabilities:

1. Parallel processing with Rayon provides significant speedup for large datasets
2. SIMD optimizations offer additional performance gains for vector operations
3. Memory optimization techniques reduce memory usage while maintaining performance
4. NUMA-aware processing improves performance on multi-socket systems

Key takeaways:
- Performance gains scale with data size
- Different optimization techniques can be combined
- Hardware-specific optimizations (SIMD, NUMA) provide additional benefits when available