In [6]:
import numpy as np
from scipy.sparse import linalg as LA
from scipy.sparse import csr_matrix
import time
import matplotlib.pyplot as plt
import random
import sys
import tracemalloc
import gc

try:
    import cupy as cp
    from cupyx.scipy.sparse import linalg as cupy_LA
    from cupyx.scipy.sparse import csr_matrix as cupy_csr_matrix
    CUPY_AVAILABLE = True
    print("CuPy is available! GPU acceleration enabled.")

    mempool = cp.get_default_memory_pool()
    pinned_mempool = cp.get_default_pinned_memory_pool()
    
    # Check GPU memory
    free_mem, total_mem = cp.cuda.runtime.memGetInfo()
    print(f"GPU Memory: {free_mem/1024**3:.2f}GB free / {total_mem/1024**3:.2f}GB total")
    
    GPU_MIN_SIZE = 100  # Threshold for GPU usage
    print(f"GPU threshold set to: {GPU_MIN_SIZE}")
    
except ImportError:
    CUPY_AVAILABLE = False
    GPU_MIN_SIZE = float('inf')
    print("CuPy not available. Only CPU version will work.")

# Global verbosity flag - simple boolean
VERBOSE = False

def set_verbose(verbose=True):
    """Set global verbosity"""
    global VERBOSE
    VERBOSE = verbose

# ============================================================================
# GPU DATA MANAGER
# ============================================================================
class GPUDataManager:
    """
    Manages data lifecycle on GPU to minimize CPU-GPU transfers.
    """
    def __init__(self):
        self.gpu_cache = {}
        self.transfer_count = 0
        self.transfer_bytes = 0
    
    def get_gpu_matrix(self, matrix, key=None):
        """Get matrix on GPU, using cache when possible"""
        if not CUPY_AVAILABLE:
            return matrix
        
        if key and key in self.gpu_cache:
            return self.gpu_cache[key]
        
        if isinstance(matrix, cp.ndarray):
            return matrix
        
        # Transfer to GPU
        if hasattr(matrix, 'toarray'):
            gpu_matrix = cp.asarray(matrix.toarray())
        else:
            gpu_matrix = cp.asarray(matrix)
        
        self.transfer_count += 1
        self.transfer_bytes += gpu_matrix.nbytes
        
        if key:
            self.gpu_cache[key] = gpu_matrix
        
        return gpu_matrix
    
    def clear_cache(self):
        """Clear GPU cache and free memory"""
        self.gpu_cache.clear()
        if CUPY_AVAILABLE:
            mempool = cp.get_default_memory_pool()
            mempool.free_all_blocks()
    
    def get_stats(self):
        """Get transfer statistics"""
        return {
            'transfers': self.transfer_count,
            'bytes': self.transfer_bytes,
            'mb': self.transfer_bytes / (1024**2)
        }

# Global GPU data manager
gpu_manager = GPUDataManager()

# ============================================================================
# CORE ALGORITHMS
# ============================================================================
def bicut_group_gpu_native(L_gpu):
    """
    Pure GPU implementation - all operations stay on GPU until final result.
    """
    n = L_gpu.shape[0]
    
    try:
        # Eigenvalue computation on GPU
        if cp.sparse.issparse(L_gpu):
            _, eigenvecs = cupy_LA.eigsh(L_gpu, k=2, which='SA')
        else:
            eigenvals, eigenvecs = cp.linalg.eigh(L_gpu)
            idx = cp.argsort(eigenvals)
            eigenvecs = eigenvecs[:, idx[:2]]
        
        # Fiedler vector processing - all on GPU
        fiedler_vector = eigenvecs[:, 1]
        sorted_args = cp.argsort(fiedler_vector)
        
        # Matrix reordering - stay on GPU
        adj = -L_gpu[cp.ix_(sorted_args, sorted_args)]
        
        # Optimized cut finding - fully vectorized on GPU
        if n < 1000:
            upper_tri_sums = cp.zeros(n-1)
            for i in range(1, n):
                upper_tri_sums[i-1] = cp.sum(adj[i:, :i])
        else:
            # Memory-efficient approach for large matrices
            upper_tri_sums = cp.zeros(n-1)
            batch_size = min(200, n // 5)
            for start in range(1, n, batch_size):
                end = min(start + batch_size, n)
                for i in range(start, end):
                    upper_tri_sums[i-1] = cp.sum(adj[i:, :i])
        
        # Quality computation on GPU
        ind = cp.arange(1, n, dtype=cp.float64)
        qualities = upper_tri_sums / (ind * (n - ind))
        best_cut = cp.argmin(qualities) + 1
        
        # Transfer minimal data back to CPU
        sorted_args_cpu = cp.asnumpy(sorted_args)
        best_cut_cpu = int(cp.asnumpy(best_cut))
        
        # Final grouping on CPU
        first_group = sorted_args_cpu[:best_cut_cpu]
        second_group = sorted_args_cpu[best_cut_cpu:]
        
        if 0 in first_group:
            return first_group.tolist(), second_group.tolist()
        return second_group.tolist(), first_group.tolist()
        
    except Exception as e:
        raise RuntimeError(f"GPU computation failed: {e}")

def bicut_group(L, use_gpu=False):
    """
    Enhanced spectral clustering with minimal transfer strategy.
    """
    n = L.shape[0]

    # Basis steps
    if n == 0:
        raise ValueError("The Laplacian matrix is empty.") 
    if n == 1:
        return [0], []
    if n == 2:
        return [0], [1]
    
    # Decision: Use GPU for matrices above threshold
    should_use_gpu = use_gpu and CUPY_AVAILABLE and n >= GPU_MIN_SIZE
    
    if should_use_gpu:
        try:
            # Transfer to GPU once
            L_gpu = gpu_manager.get_gpu_matrix(L)
            return bicut_group_gpu_native(L_gpu)
        except Exception as e:
            if VERBOSE:
                print(f"GPU failed: {str(e)[:50]}..., using CPU fallback")
            return _bicut_group_cpu(L)
    else:
        return _bicut_group_cpu(L)

def _bicut_group_cpu(L):
    """Original CPU implementation"""
    n = L.shape[0]
    
    # Handle CuPy arrays that need to be transferred
    if isinstance(L, cp.ndarray):
        L = cp.asnumpy(L)
    elif hasattr(L, 'toarray') and hasattr(L, 'get'):
        L = L.get().toarray()
    elif hasattr(L, 'toarray'):
        L = L.toarray()
    
    # Get Fiedler vector and sort vertices
    _, eigenvecs = LA.eigsh(L, k=2, which='SA')
    fiedler_vector = eigenvecs[:, 1]
    sorted_args = np.argsort(fiedler_vector)
    
    # Reorder adjacency matrix
    adj = -L[np.ix_(sorted_args, sorted_args)]
    
    # Find best cut
    ind = np.arange(1, n)
    upper_tri_sums = np.array([np.sum(adj[i:, :i]) for i in ind])
    qualities = upper_tri_sums / (ind * (n - ind))
    
    best_cut = np.argmin(qualities) + 1
    
    # Get the groups based on sorted indices
    first_group = sorted_args[:best_cut]
    second_group = sorted_args[best_cut:]

    if 0 in first_group:
        return first_group.tolist(), second_group.tolist()
    return second_group.tolist(), first_group.tolist()

# ============================================================================
# TREE STRUCTURE
# ============================================================================
class BiCutNode:
    """Node class for the bi-cut tree structure"""
    def __init__(self, indices, left=None, right=None, parent=None):
        self.indices = indices
        self.left = left
        self.right = right

    def is_leaf(self):
        return self.left is None and self.right is None
    
    def get_order(self):
        """Get order of singleton vertices"""
        order = []
        
        def collect_singletons(node):
            if node.is_leaf():
                order.extend(node.indices)
            else:
                if node.left:
                    collect_singletons(node.left)
                if node.right:
                    collect_singletons(node.right)
        
        collect_singletons(self)
        return order        
    
    def print_fancy_tree(self, prefix="", is_last=True, is_root=True):
        """Print tree with fancy box-drawing characters"""
        if is_root:
            print("BiCut Tree Structure")
        
        connector = "├─" if is_root else ("└─" if is_last else "├─")
        indices_str = f"[{', '.join(map(str, sorted(self.indices)))}]"
        
        print(f"{prefix}{connector} {indices_str}")
        
        new_prefix = prefix + ("│  " if is_root else ("   " if is_last else "│  "))
        children = [child for child in [self.left, self.right] if child is not None]
        
        for i, child in enumerate(children):
            is_last_child = (i == len(children) - 1)
            child.print_fancy_tree(new_prefix, is_last_child, False)

# ============================================================================
# TREE BUILDERS
# ============================================================================
def treebuilder_gpu_batch(laplacian_matrix, thre=None, use_gpu=True):
    """
    GPU-optimized tree builder that minimizes transfers by processing in batches.
    """
    n = laplacian_matrix.shape[0]
    
    if not use_gpu or not CUPY_AVAILABLE:
        if VERBOSE:
            print("GPU disabled by user or CuPy unavailable")
        return treebuilder_cpu(laplacian_matrix, thre, list(range(n)))
    
    if n < GPU_MIN_SIZE:
        if VERBOSE:
            print(f"Matrix too small for GPU ({n} < {GPU_MIN_SIZE})")
        return treebuilder_cpu(laplacian_matrix, thre, list(range(n)))
    
    if VERBOSE:
        print(f"Using GPU batch processing for matrix size {n}x{n}")
    
    # Transfer matrix to GPU once at the beginning
    L_gpu = gpu_manager.get_gpu_matrix(laplacian_matrix, key='main_matrix')
    
    def build_subtree_gpu(gpu_matrix, indices, depth=0):
        """Recursive function that works primarily on GPU"""
        n_sub = len(indices)
        
        # Base cases
        if n_sub == 0:
            raise ValueError("Empty matrix")
        if n_sub == 1:
            return BiCutNode(indices)
        if n_sub == 2:
            return BiCutNode(indices, BiCutNode([indices[0]]), BiCutNode([indices[1]]))
        if thre is not None and n_sub <= thre:
            return BiCutNode(indices)
        
        # For large submatrices at shallow depth, use GPU
        gpu_threshold = max(30, GPU_MIN_SIZE // (2 ** depth))
        should_use_gpu_here = n_sub >= gpu_threshold and depth < 6
        
        if should_use_gpu_here:
            try:
                # Create submatrix indices on GPU
                sub_indices = cp.arange(n_sub)
                
                # Extract submatrix (stays on GPU)
                submatrix_gpu = gpu_matrix[cp.ix_(sub_indices, sub_indices)]
                
                # Process on GPU using our GPU-native function
                first_local, second_local = bicut_group_gpu_native(submatrix_gpu)
                
                # Convert back to global indices
                first_group = [indices[i] for i in first_local]
                second_group = [indices[i] for i in second_local]
                
                if not second_group:
                    return BiCutNode(indices)
                
                # Create node and recurse
                node = BiCutNode(indices)
                
                # Create submatrices for recursion (stay on GPU)
                first_sub_gpu = gpu_matrix[cp.ix_(cp.asarray(first_local), cp.asarray(first_local))]
                second_sub_gpu = gpu_matrix[cp.ix_(cp.asarray(second_local), cp.asarray(second_local))]
                
                node.left = build_subtree_gpu(first_sub_gpu, first_group, depth + 1)
                node.right = build_subtree_gpu(second_sub_gpu, second_group, depth + 1)
                
                return node
                
            except Exception as e:
                if VERBOSE:
                    print(f"GPU processing failed at depth {depth}: {e}")
                # Transfer submatrix to CPU and continue
                submatrix_cpu = cp.asnumpy(gpu_matrix[:n_sub, :n_sub])
                return treebuilder_cpu(submatrix_cpu, thre, indices)
        
        else:
            # For small submatrices, transfer to CPU
            submatrix_cpu = cp.asnumpy(gpu_matrix[:n_sub, :n_sub])
            return treebuilder_cpu(submatrix_cpu, thre, indices)
    
    try:
        result = build_subtree_gpu(L_gpu, list(range(n)))
        if VERBOSE:
            stats = gpu_manager.get_stats()
            print(f"GPU processing complete. Transfers: {stats['transfers']}, Data: {stats['mb']:.1f}MB")
        return result
    
    except Exception as e:
        if VERBOSE:
            print(f"GPU batch processing failed: {e}")
        gpu_manager.clear_cache()
        return treebuilder_cpu(laplacian_matrix, thre, list(range(n)))

def treebuilder_cpu(laplacian_matrix, thre=None, indices=None):
    """CPU-only tree builder (original algorithm)"""
    if indices is None:
        indices = list(range(laplacian_matrix.shape[0]))
    
    n = len(indices)
    
    if n == 0:
        raise ValueError("Empty matrix")
    if n == 1:
        return BiCutNode(indices)
    if n == 2:
        return BiCutNode(indices, BiCutNode([indices[0]]), BiCutNode([indices[1]]))
    if thre is not None and n <= thre:
        return BiCutNode(indices)
    
    # Extract submatrix
    submatrix = laplacian_matrix[np.ix_(range(n), range(n))]
    
    # Apply bicut - explicitly disable GPU to avoid recursion issues
    first_group_local, second_group_local = bicut_group(submatrix, use_gpu=False)
    
    # Convert to global indices
    first_group = [indices[i] for i in first_group_local]
    second_group = [indices[i] for i in second_group_local]
    
    if not second_group:
        return BiCutNode(indices)
    
    # Create node and recurse
    node = BiCutNode(indices)
    
    first_submatrix = laplacian_matrix[np.ix_(first_group_local, first_group_local)]
    second_submatrix = laplacian_matrix[np.ix_(second_group_local, second_group_local)]
    
    node.left = treebuilder_cpu(first_submatrix, thre, first_group)
    node.right = treebuilder_cpu(second_submatrix, thre, second_group)
    
    return node

def treebuilder(laplacian_matrix, thre=None, indices=None, use_gpu=False):
    """
    Main treebuilder function with transfer-optimized GPU acceleration.
    """
    # Clear any previous GPU cache
    gpu_manager.clear_cache()
    
    n = laplacian_matrix.shape[0]
    if VERBOSE:
        print(f"Building tree for {n}x{n} matrix...")
    
    start_time = time.time()
    
    if use_gpu and CUPY_AVAILABLE:
        result = treebuilder_gpu_batch(laplacian_matrix, thre)
    else:
        result = treebuilder_cpu(laplacian_matrix, thre, indices)
    
    elapsed = time.time() - start_time
    if VERBOSE:
        print(f"Tree built in {elapsed:.3f}s")
    
    return result

# ============================================================================
# STRUCTURED MATRIX GENERATION
# ============================================================================
def generate_layers_groups_graph(num_supergroups=3, 
                                 num_subgroups_per_supergroup=4, 
                                 nodes_per_subgroup=10,
                                 p_intra_subgroup=0.8,
                                 p_intra_supergroup=0.3,
                                 p_inter_supergroup=0.05,
                                 seed=None):
    """
    Generate a hierarchical graph with super-groups and sub-groups structure.
    """
    if seed is not None:
        np.random.seed(seed)
    
    total_subgroups = num_supergroups * num_subgroups_per_supergroup
    total_nodes = total_subgroups * nodes_per_subgroup
    
    # Initialize adjacency matrix
    adj = np.zeros((total_nodes, total_nodes))
    
    # Create edges
    for i in range(total_nodes):
        for j in range(i + 1, total_nodes):
            # Determine which groups nodes belong to
            subgroup_i = i // nodes_per_subgroup
            subgroup_j = j // nodes_per_subgroup
            supergroup_i = subgroup_i // num_subgroups_per_supergroup
            supergroup_j = subgroup_j // num_subgroups_per_supergroup
            
            # Determine edge probability
            if subgroup_i == subgroup_j:
                p = p_intra_subgroup
            elif supergroup_i == supergroup_j:
                p = p_intra_supergroup
            else:
                p = p_inter_supergroup
            
            # Create edge with probability p
            if np.random.random() < p:
                adj[i, j] = 1
                adj[j, i] = 1
    
    # Compute Laplacian
    degrees = np.sum(adj, axis=1)
    L = np.diag(degrees) - adj
    
    return L

def generate_block_diagonal_graph(block_sizes, p_intra=0.8, p_inter=0.05, seed=None):
    """
    Generate a graph with block diagonal structure.
    """
    if seed is not None:
        np.random.seed(seed)
    
    n = sum(block_sizes)
    adj = np.zeros((n, n))
    
    # Determine block boundaries
    block_starts = [0]
    for size in block_sizes[:-1]:
        block_starts.append(block_starts[-1] + size)
    
    # Create edges
    for i in range(n):
        for j in range(i + 1, n):
            # Find which blocks i and j belong to
            block_i = 0
            block_j = 0
            for k, start in enumerate(block_starts):
                if i >= start:
                    block_i = k
                if j >= start:
                    block_j = k
            
            # Set edge probability
            p = p_intra if block_i == block_j else p_inter
            
            if np.random.random() < p:
                adj[i, j] = 1
                adj[j, i] = 1
    
    # Compute Laplacian
    degrees = np.sum(adj, axis=1)
    L = np.diag(degrees) - adj
    
    return L

def generate_random_graph(n, density=0.1, seed=None):
    """
    Generate a random Erdős–Rényi graph.
    """
    if seed is not None:
        np.random.seed(seed)
    
    adj = np.random.random((n, n)) < density
    adj = adj.astype(float)
    adj = (adj + adj.T) / 2
    np.fill_diagonal(adj, 0)
    
    degrees = np.sum(adj, axis=1)
    L = np.diag(degrees) - adj
    
    return L

# ============================================================================
# PERFORMANCE TESTING
# ============================================================================
def test_duration_memory(laplacian_matrix, thre=None, use_gpu=False):
    """
    Test the duration and memory usage of treebuilder.
    """
    # Clear caches
    gc.collect()
    if CUPY_AVAILABLE:
        gpu_manager.clear_cache()
        if use_gpu:
            cp.get_default_memory_pool().free_all_blocks()
    
    # Start monitoring
    tracemalloc.start()
    start_time = time.time()
    
    # Run treebuilder
    result = treebuilder(laplacian_matrix, thre=thre, use_gpu=use_gpu)
    
    # Get metrics
    duration = time.time() - start_time
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    
    peak_memory_mb = peak / (1024 * 1024)
    memory_ratio = current / peak if peak > 0 else 0
    
    return duration, peak_memory_mb, memory_ratio

def parallel_choices_test(iter_count=10, sup=2, sub=3, node=5, test_gpu=True):
    """
    Test performance with different matrix structures and parameters.
    """
    import csv
    
    # Prepare CSV file
    csv_filename = 'parallel_test_results.csv'
    print(f"\n{'='*70}")
    print(f"STRUCTURED MATRIX PERFORMANCE TEST")
    print(f"Results will be saved to: {csv_filename}")
    print(f"Testing {'CPU and GPU' if test_gpu and CUPY_AVAILABLE else 'CPU only'}")
    print("="*70)
    
    with open(csv_filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # Write header
        header = ['iteration', 'matrix_type', 'total_nodes', 'structure',
                  'cpu_duration_thre', 'cpu_memory_thre', 'cpu_ratio_thre', 
                  'cpu_duration_no_thre', 'cpu_memory_no_thre', 'cpu_ratio_no_thre']
        
        if test_gpu and CUPY_AVAILABLE:
            header.extend(['gpu_duration_thre', 'gpu_memory_thre', 'gpu_ratio_thre',
                          'gpu_duration_no_thre', 'gpu_memory_no_thre', 'gpu_ratio_no_thre',
                          'speedup_thre', 'speedup_no_thre'])
        
        writer.writerow(header)
        
        for i in range(1, iter_count + 1):
            total_nodes = i * sup * sub * node
            print(f"\nIteration {i}/{iter_count} (total nodes: {total_nodes})")
            
            # Test different matrix structures
            test_configs = [
                ('supergroups', i*sup, sub, node, f"{i*sup}x{sub}x{node}"),
                ('subgroups', sup, i*sub, node, f"{sup}x{i*sub}x{node}"),
                ('nodes_per_sub', sup, sub, i*node, f"{sup}x{sub}x{i*node}")
            ]
            
            for matrix_type, s, sb, n, structure in test_configs:
                print(f"  Testing {matrix_type}: {structure}")
                
                # Generate structured matrix
                L = generate_layers_groups_graph(
                    num_supergroups=s,
                    num_subgroups_per_supergroup=sb,
                    nodes_per_subgroup=n,
                    p_intra_subgroup=0.8,
                    p_intra_supergroup=0.3,
                    p_inter_supergroup=0.05,
                    seed=42
                )
                
                total = s * sb * n
                thre_value = n if matrix_type == 'nodes_per_sub' else node
                
                # Test CPU version
                print(f"    CPU tests...", end='')
                cpu_thre = test_duration_memory(L, thre=thre_value, use_gpu=False)
                cpu_no_thre = test_duration_memory(L, thre=1, use_gpu=False)
                print(f" done (thre={thre_value}: {cpu_thre[0]:.2f}s, thre=1: {cpu_no_thre[0]:.2f}s)")
                
                row = [i, matrix_type, total, structure,
                       cpu_thre[0], cpu_thre[1], cpu_thre[2],
                       cpu_no_thre[0], cpu_no_thre[1], cpu_no_thre[2]]
                
                # Test GPU version if available
                if test_gpu and CUPY_AVAILABLE and total >= GPU_MIN_SIZE:
                    print(f"    GPU tests...", end='')
                    gpu_thre = test_duration_memory(L, thre=thre_value, use_gpu=True)
                    gpu_no_thre = test_duration_memory(L, thre=1, use_gpu=True)
                    
                    speedup_thre = cpu_thre[0] / gpu_thre[0] if gpu_thre[0] > 0 else 0
                    speedup_no_thre = cpu_no_thre[0] / gpu_no_thre[0] if gpu_no_thre[0] > 0 else 0
                    
                    print(f" done (speedup: {speedup_thre:.2f}x with thre, {speedup_no_thre:.2f}x without)")
                    
                    row.extend([gpu_thre[0], gpu_thre[1], gpu_thre[2],
                               gpu_no_thre[0], gpu_no_thre[1], gpu_no_thre[2],
                               speedup_thre, speedup_no_thre])
                elif test_gpu and CUPY_AVAILABLE:
                    print(f"    GPU skipped (matrix too small: {total} < {GPU_MIN_SIZE})")
                    row.extend([None] * 8)
                
                writer.writerow(row)
                csvfile.flush()
            
            if i % 5 == 0:
                print(f"  Completed {i}/{iter_count} iterations")
    
    print(f"\nTest completed! Results saved to {csv_filename}")

# ============================================================================
# BENCHMARKING
# ============================================================================
def benchmark_transfer_analysis(laplacian_matrix, thre=None, num_runs=3):
    """
    Benchmark with detailed transfer analysis.
    """
    n = laplacian_matrix.shape[0]
    print(f"\n{'='*70}")
    print(f"TRANSFER ANALYSIS BENCHMARK - Matrix {n}x{n}")
    print(f"GPU_MIN_SIZE threshold: {GPU_MIN_SIZE}")
    print(f"CuPy available: {CUPY_AVAILABLE}")
    print(f"Matrix qualifies for GPU: {n >= GPU_MIN_SIZE}")
    print("="*70)
    
    results = {}
    
    # CPU benchmark
    print("\nCPU VERSION:")
    cpu_times = []
    for i in range(num_runs):
        gpu_manager.clear_cache()
        print(f"  CPU Run {i+1}...", end='')
        start = time.time()
        cpu_result = treebuilder(laplacian_matrix, thre=thre, use_gpu=False)
        end = time.time()
        cpu_times.append(end - start)
        print(f" {cpu_times[-1]:.3f}s")
    
    cpu_avg = np.mean(cpu_times)
    print(f"CPU Average: {cpu_avg:.3f}s")
    
    # GPU benchmark if available
    if CUPY_AVAILABLE and n >= GPU_MIN_SIZE:
        print(f"\nGPU VERSION:")
        
        gpu_times = []
        transfer_stats = []
        
        for i in range(num_runs):
            gpu_manager.clear_cache()
            gpu_manager.transfer_count = 0
            gpu_manager.transfer_bytes = 0
            
            print(f"  GPU Run {i+1}...", end='')
            start = time.time()
            gpu_result = treebuilder(laplacian_matrix, thre=thre, use_gpu=True)
            cp.cuda.Stream.null.synchronize()
            end = time.time()
            
            current_stats = gpu_manager.get_stats()
            
            gpu_times.append(end - start)
            transfer_stats.append(current_stats.copy())
            
            print(f" {gpu_times[-1]:.3f}s, Transfers: {transfer_stats[-1]['transfers']}, Data: {transfer_stats[-1]['mb']:.1f}MB")
        
        gpu_avg = np.mean(gpu_times)
        avg_transfers = np.mean([s['transfers'] for s in transfer_stats])
        avg_mb = np.mean([s['mb'] for s in transfer_stats])
        
        print(f"GPU Average: {gpu_avg:.3f}s")
        print(f"Avg Transfers: {avg_transfers:.1f}")
        print(f"Avg Data: {avg_mb:.1f}MB")
        
        speedup = cpu_avg / gpu_avg if gpu_avg > 0 else 0
        
        print(f"\n{'='*50}")
        print("ANALYSIS:")
        print(f"  Speedup: {speedup:.2f}x")
        
        if avg_transfers == 0:
            print("  WARNING: No GPU transfers detected!")
        else:
            print(f"  GPU successfully used: {avg_transfers:.0f} transfers, {avg_mb:.1f}MB")
        
        # Verify correctness
        try:
            cpu_order = cpu_result.get_order()
            gpu_order = gpu_result.get_order()
            identical = cpu_order == gpu_order
            print(f"  Results identical: {identical}")
            if not identical:
                print("  WARNING: GPU and CPU results differ!")
        except Exception as e:
            print(f"  Could not verify results: {e}")
        
        results = {
            'cpu_time': cpu_avg,
            'gpu_time': gpu_avg,
            'speedup': speedup,
            'transfers': avg_transfers,
            'transfer_mb': avg_mb,
            'gpu_actually_used': avg_transfers > 0
        }
    
    else:
        print(f"\nGPU not available or matrix too small")
        results = {'cpu_time': cpu_avg, 'gpu_time': None}
    
    gpu_manager.clear_cache()
    return results

CuPy is available! GPU acceleration enabled.
GPU Memory: 10.79GB free / 12.00GB total
GPU threshold set to: 100


In [10]:
if __name__ == "__main__":
    print("="*80)
    print("TRANSFER-OPTIMIZED BICUT WITH CUPY")
    print("="*80)
    
    # Quick demo
    print("\nDEMO: STRUCTURED MATRIX TESTS")
    print("-"*60)
    
    # Test hierarchical community structure
    print("\nTest 1: Hierarchical Community Structure")
    print("  Structure: 2 supergroups x 3 subgroups x 20 nodes")
    L_hier = generate_layers_groups_graph(
        num_supergroups=10,
        num_subgroups_per_supergroup=1,
        nodes_per_subgroup=20,
        p_intra_subgroup=0.8,
        p_intra_supergroup=0.3,
        p_inter_supergroup=0.05,
        seed=42
    )
    print(f"  Matrix size: {L_hier.shape[0]}x{L_hier.shape[0]}")
    
    print("  CPU Performance:", end='')
    cpu_time, cpu_mem, _ = test_duration_memory(L_hier, thre=10, use_gpu=False)
    print(f" Time: {cpu_time:.3f}s, Memory: {cpu_mem:.1f}MB")
    
    if CUPY_AVAILABLE and L_hier.shape[0] >= GPU_MIN_SIZE:
        print("  GPU Performance:", end='')
        gpu_time, gpu_mem, _ = test_duration_memory(L_hier, thre=10, use_gpu=True)
        print(f" Time: {gpu_time:.3f}s, Memory: {gpu_mem:.1f}MB, Speedup: {cpu_time/gpu_time:.2f}x")
    
    # Run benchmark
    print("\nRunning benchmark with larger matrix...")
    L_test = generate_random_graph(500, density=0.05, seed=42)
    results = benchmark_transfer_analysis(L_test, thre=10, num_runs=2)
    
    gpu_manager.clear_cache()

TRANSFER-OPTIMIZED BICUT WITH CUPY

DEMO: STRUCTURED MATRIX TESTS
------------------------------------------------------------

Test 1: Hierarchical Community Structure
  Structure: 2 supergroups x 3 subgroups x 20 nodes
  Matrix size: 2000x2000
  CPU Performance: Time: 2.447s, Memory: 168.9MB
  GPU Performance: Time: 4.413s, Memory: 9.1MB, Speedup: 0.55x

Running benchmark with larger matrix...

TRANSFER ANALYSIS BENCHMARK - Matrix 500x500
GPU_MIN_SIZE threshold: 100
CuPy available: True
Matrix qualifies for GPU: True

CPU VERSION:
  CPU Run 1... 0.455s
  CPU Run 2... 0.451s
CPU Average: 0.453s

GPU VERSION:
  GPU Run 1... 0.749s, Transfers: 1, Data: 1.9MB
  GPU Run 2... 0.651s, Transfers: 1, Data: 1.9MB
GPU Average: 0.700s
Avg Transfers: 1.0
Avg Data: 1.9MB

ANALYSIS:
  Speedup: 0.65x
  GPU successfully used: 1 transfers, 1.9MB
  Results identical: False

Usage:
  set_verbose(True)  # Enable verbose output
  set_verbose(False) # Disable verbose output (default)
  
  # Run parallel te