# In [None]:
import cudaq
import numpy as np
from scipy.optimize import minimize
from typing import List, Tuple, Optional
from dataclasses import dataclass
from collections import deque
import time
import matplotlib.pyplot as plt

# ============================================================
# GPU ACCELERATION: Import CuPy (falls back to NumPy if unavailable)
# ============================================================
try:
    import cupy as cp

    # Importing CuPy can succeed on a machine with no usable CUDA device
    # (for example a CPU-only container). Probe the runtime so the GPU code
    # paths are only enabled when at least one device is visible.
    if cp.cuda.runtime.getDeviceCount() < 1:
        raise RuntimeError("no CUDA device visible")
    GPU_AVAILABLE = True
    print("[GPU] CuPy detected - GPU acceleration enabled")
except ImportError:
    import numpy as cp  # Fallback to NumPy
    GPU_AVAILABLE = False
    print("[CPU] CuPy not found - using CPU (install with: pip install cupy-cuda12x)")
except RuntimeError as exc:
    # CUDA runtime failures reported by CuPy are RuntimeError subclasses.
    import numpy as cp  # Fallback to NumPy
    GPU_AVAILABLE = False
    print(f"[CPU] CuPy installed but no usable CUDA device ({exc}) - using CPU")


# ============================================================
# GPU-OPTIMIZED ENERGY COMPUTATION
# ============================================================
def compute_energy_fft_gpu(sequence: np.ndarray) -> float:
    """Return the sidelobe energy of ``sequence`` via an FFT autocorrelation.

    The sequence is zero-padded to twice its length so that the circular
    autocorrelation recovered from the power spectrum equals the aperiodic
    one. The energy is the sum of the squared autocorrelation values at
    lags ``1 .. n-1``.

    Args:
        sequence: 1-D array of sequence values (callers in this file pass
            +1/-1 entries).

    Returns:
        The energy as a Python float. Sequences with fewer than two
        elements have no non-zero lag and yield ``0.0``.
    """
    n = len(sequence)
    if n < 2:
        # No off-peak lags exist; this also avoids running the FFT on an
        # empty array, which NumPy rejects.
        return 0.0

    # Same algorithm on both backends; only the array module and the
    # working precision differ (float32 on the GPU, float64 on the CPU).
    if GPU_AVAILABLE:
        xp, dtype = cp, cp.float32
    else:
        xp, dtype = np, np.float64

    padded = xp.zeros(2 * n, dtype=dtype)
    padded[:n] = xp.asarray(sequence, dtype=dtype)

    power_spectrum = xp.abs(xp.fft.fft(padded)) ** 2
    autocorr = xp.fft.ifft(power_spectrum).real[:n]
    energy = xp.sum(autocorr[1:] ** 2)

    # CuPy results live on the device and must be copied back first.
    return float(energy.get()) if GPU_AVAILABLE else float(energy)


def compute_energy_batch_gpu(population: np.ndarray) -> np.ndarray:
    """Return the sidelobe energy of every row of ``population``.

    All rows are zero-padded and transformed with one batched FFT along
    axis 1, on the GPU when CuPy is active and with NumPy otherwise, so
    neither backend loops over individuals in Python.

    Args:
        population: 2-D array of shape ``(n_sequences, n_qubits)``.

    Returns:
        1-D host array with one energy per row. Rows shorter than two
        elements have no non-zero lag and yield ``0.0``.
    """
    n_sequences, n_qubits = population.shape
    if n_sequences == 0 or n_qubits < 2:
        # Nothing to transform; also keeps the FFT away from zero-length axes.
        return np.zeros(n_sequences)

    if GPU_AVAILABLE:
        xp, dtype = cp, cp.float32
    else:
        xp, dtype = np, np.float64

    # One transfer / one allocation for the whole population.
    padded = xp.zeros((n_sequences, 2 * n_qubits), dtype=dtype)
    padded[:, :n_qubits] = xp.asarray(population, dtype=dtype)

    power_spectra = xp.abs(xp.fft.fft(padded, axis=1)) ** 2
    autocorrs = xp.fft.ifft(power_spectra, axis=1).real[:, :n_qubits]
    energies = xp.sum(autocorrs[:, 1:] ** 2, axis=1)

    # Device results are copied back so callers always get a NumPy array.
    return energies.get() if GPU_AVAILABLE else energies


def compute_merit_factor_fft(sequence: np.ndarray) -> float:
    """Return the merit factor ``n**2 / (2 * E)`` of ``sequence``.

    ``E`` is the value returned by ``compute_energy_fft_gpu``. An energy of
    exactly zero maps to positive infinity instead of dividing by zero.
    """
    sidelobe_energy = compute_energy_fft_gpu(sequence)
    if sidelobe_energy != 0:
        length = len(sequence)
        return length ** 2 / (2 * sidelobe_energy)
    return float('inf')


# ============================================================
# CUDA-Q QUANTUM COMPONENTS (Already GPU-accelerated)
# ============================================================
def build_labs_hamiltonian(n_qubits: int) -> cudaq.SpinOperator:
    """Assemble the LABS cost operator as a sum of Pauli-Z products.

    For every lag ``k`` in ``1 .. n_qubits-1`` and every index pair
    ``(i, j)`` with ``i, j < n_qubits - k`` one term is added: the product
    of ``Z`` on each site that occurs an odd number of times among
    ``i, i+k, j, j+k``. When every site occurs an even number of times a
    default-constructed ``cudaq.SpinOperator()`` is added instead.

    NOTE(review): what a default-constructed ``SpinOperator`` represents
    depends on the CUDA-Q version - confirm, since the accumulator below
    also starts from one.
    """
    hamiltonian = cudaq.SpinOperator()

    for lag in range(1, n_qubits):
        span = n_qubits - lag
        for i in range(span):
            for j in range(span):
                # Track, per site, whether it appears an odd number of times.
                parity = [0] * n_qubits
                for site in (i, i + lag, j, j + lag):
                    parity[site] ^= 1
                odd_sites = [site for site, bit in enumerate(parity) if bit]

                if not odd_sites:
                    # All Z factors cancel pairwise.
                    hamiltonian += cudaq.SpinOperator()
                    continue

                # Multiply the Z factors in ascending site order.
                term = cudaq.spin.z(odd_sites[0])
                for site in odd_sites[1:]:
                    term *= cudaq.spin.z(site)
                hamiltonian += term

    return hamiltonian


# Circuit dimensions kept at module scope (CUDA-Q requirement); the
# vqe_circuit and vqe_circuit_measure kernels read them.
n_qubits_global = 7
n_layers_global = 5


def set_circuit_params(n_qubits: int, n_layers: int):
    """Overwrite the module-level qubit and layer counts."""
    global n_qubits_global, n_layers_global
    n_qubits_global, n_layers_global = n_qubits, n_layers


@cudaq.kernel
def hea_ansatz(qubits: cudaq.qview, params: List[float], n_layers: int):
    """Hardware-efficient ansatz.

    Each of the ``n_layers`` layers applies an ``ry`` and then an ``rz``
    rotation to every qubit, followed by controlled-X gates between
    neighbouring qubits and one closing controlled-X from the last qubit
    back to the first. Parameters are consumed in order, two per qubit per
    layer, so ``2 * qubits.size() * n_layers`` entries of ``params`` are read.
    """
    n_qubits = qubits.size()
    param_idx = 0
    
    for layer in range(n_layers):
        # Rotation sub-layer: two consecutive parameters per qubit.
        for qubit in range(n_qubits):
            ry(params[param_idx], qubits[qubit])
            param_idx += 1
            rz(params[param_idx], qubits[qubit])
            param_idx += 1
        
        # Entangling sub-layer: controlled-X from each qubit to its successor.
        for qubit in range(n_qubits - 1):
            x.ctrl(qubits[qubit], qubits[qubit + 1])
        
        # Close the ring: last qubit controls the first.
        # NOTE(review): with a single qubit this uses the same qubit as
        # control and target - confirm callers always pass >= 2 qubits.
        x.ctrl(qubits[n_qubits - 1], qubits[0])


@cudaq.kernel
def vqe_circuit(params: List[float]):
    # Ansatz circuit without measurement; run_vqe_optimization passes it to
    # cudaq.observe(). Register size and layer count come from the module
    # globals n_qubits_global / n_layers_global.
    qubits = cudaq.qvector(n_qubits_global)
    hea_ansatz(qubits, params, n_layers_global)


@cudaq.kernel
def vqe_circuit_measure(params: List[float]):
    # Same ansatz as vqe_circuit, followed by a measurement of every qubit;
    # sample_population passes it to cudaq.sample().
    qubits = cudaq.qvector(n_qubits_global)
    hea_ansatz(qubits, params, n_layers_global)
    mz(qubits)


def run_vqe_optimization(
    hamiltonian: cudaq.SpinOperator,
    initial_params: np.ndarray,
    maxiter: int = 200
) -> Tuple[np.ndarray, float, List[float]]:
    """Minimise the expectation value of ``hamiltonian`` with COBYLA.

    The objective evaluates ``vqe_circuit`` through ``cudaq.observe`` for
    each candidate parameter vector. Whether that evaluation runs on a GPU
    depends on the active CUDA-Q target - presumably GPU here; confirm.

    Args:
        hamiltonian: Operator whose expectation value is minimised.
        initial_params: Starting point for the optimiser.
        maxiter: Iteration budget handed to COBYLA.

    Returns:
        ``(optimised parameters, final objective value, energy trace)``;
        the trace holds the energy of every objective call in call order.
    """
    trace: List[float] = []

    def expectation_value(theta):
        value = cudaq.observe(vqe_circuit, hamiltonian, theta.tolist()).expectation()
        trace.append(value)
        return value

    cobyla_options = {'maxiter': maxiter, 'rhobeg': 0.5, 'tol': 1e-6}
    outcome = minimize(
        expectation_value,
        initial_params,
        method='COBYLA',
        options=cobyla_options,
    )

    return outcome.x, outcome.fun, trace


def sample_population(optimized_params: np.ndarray, n_samples: int = 100, seed: int = 42):
    """Sample measurement outcomes from the optimised VQE circuit.

    Args:
        optimized_params: Parameter vector passed to ``vqe_circuit_measure``.
        n_samples: Number of shots requested from ``cudaq.sample``.
        seed: Seed applied to both NumPy and CUDA-Q before sampling.

    Returns:
        The result object returned by ``cudaq.sample``.
    """
    # Seeding NumPy alone does not make the shots reproducible because the
    # sampling is done by CUDA-Q, so CUDA-Q is seeded explicitly. The NumPy
    # seed is kept: the NumPy-based search steps in this file draw from the
    # global generator afterwards.
    np.random.seed(seed)
    cudaq.set_random_seed(seed)
    counts = cudaq.sample(vqe_circuit_measure, optimized_params.tolist(),
                          shots_count=n_samples)
    return counts


def convert_sample_to_arr(sample, N, shots=100):
    """Expand a bitstring-count mapping into an array of +1/-1 rows.

    Each bitstring contributes ``count`` identical rows, with bit ``1``
    mapped to ``+1`` and bit ``0`` mapped to ``-1``.

    Args:
        sample: Object exposing ``items()`` that yields
            ``(bitstring, count)`` pairs (e.g. a CUDA-Q sample result or a
            plain dict).
        N: Expected length of every bitstring.
        shots: Upper bound on the number of rows returned.

    Returns:
        Integer array of shape ``(rows, N)`` where ``rows`` is the smaller
        of ``shots`` and the total count. No padding rows are produced: an
        all-zero row is not a valid sequence and would score an energy of
        zero in the energy functions of this file.

    Raises:
        ValueError: If a bitstring does not have length ``N``.
    """
    rows = []
    for bitstring, count in sample.items():
        room = shots - len(rows)
        if room <= 0:
            # More outcomes than requested rows: stop instead of overrunning.
            break
        if len(bitstring) != N:
            raise ValueError(
                f"bitstring {bitstring!r} has length {len(bitstring)}, expected {N}"
            )
        spins = [1 if bit == "1" else -1 for bit in bitstring]
        rows.extend([spins] * min(count, room))

    if not rows:
        return np.zeros((0, N), dtype=int)
    return np.array(rows, dtype=int)


def generate_quantum_pop(n_qubits: int, n_layers: int, n_pop: int, max_iter: int = 200,
                         seed: int = 42):
    """Generate quantum population via VQE.

    Sets the module-level circuit dimensions, builds the LABS Hamiltonian,
    optimises the ansatz parameters and samples ``n_pop`` shots from the
    optimised circuit.

    Args:
        n_qubits: Sequence length / number of qubits.
        n_layers: Number of ansatz layers.
        n_pop: Number of shots to sample.
        max_iter: Iteration budget for the VQE optimiser.
        seed: Seed for the initial parameters and for sampling. The default
            reproduces the previously hard-coded value; pass another value
            to obtain an independent restart.

    Returns:
        ``(history, quantum_pop)``: the VQE energy trace and the sampled
        population as an array of +1/-1 rows.
    """
    set_circuit_params(n_qubits, n_layers)
    # Two rotation angles per qubit per layer.
    n_params = 2 * n_qubits * n_layers

    print("\n[1] Building LABS Hamiltonian...")
    hamiltonian = build_labs_hamiltonian(n_qubits)

    print("\n[2] Initializing variational parameters...")
    np.random.seed(seed)
    initial_params = np.random.uniform(-np.pi/4, np.pi/4, n_params)

    print("\n[3] Running VQE optimization (GPU-accelerated via CUDA-Q)...")
    optimized_params, final_energy, history = run_vqe_optimization(
        hamiltonian, initial_params, maxiter=max_iter
    )

    print(f"    Final energy: {final_energy:.6f}")

    print("\n[4] Sampling population for classical seeding...")
    counts = sample_population(optimized_params, n_samples=n_pop, seed=seed)

    quantum_pop = convert_sample_to_arr(counts, n_qubits, n_pop)

    print(f"\n[5] Population generated, Size: {np.shape(quantum_pop)}")
    return history, quantum_pop


# ============================================================
# GPU-OPTIMIZED TABU SEARCH
# ============================================================
@dataclass
class TabuSearchConfig:
    """Settings consumed by ``tabu_search_local_gpu``."""
    # Capacity of the tabu list (number of recent moves kept forbidden).
    tabu_tenure: int = 7
    # The search loop stops once this many consecutive iterations have
    # failed to improve the best energy (it is not a cap on total iterations).
    max_iterations: int = 1000
    # Margin by which the overall best flip must beat the best energy found
    # so far before it is accepted regardless of its tabu status.
    aspiration_threshold: float = 0.0


class TabuList:
    """Fixed-capacity memory of the most recently applied moves.

    ``tabu_moves`` keeps the moves in insertion order (oldest first) and
    ``tabu_set`` mirrors its contents for O(1) membership tests. Each move
    is stored at most once, so the two containers always agree.
    """

    def __init__(self, tenure: int):
        self.tenure = tenure
        self.tabu_moves = deque(maxlen=tenure)
        self.tabu_set = set()

    def add(self, move: int):
        """Mark ``move`` as tabu, evicting the oldest move when full.

        Re-adding a move that is already tabu refreshes its position instead
        of storing a duplicate; a duplicate would let the eviction of the
        older copy clear the move from ``tabu_set`` while the newer copy is
        still inside the tenure window.
        """
        if self.tenure <= 0:
            # A zero-length list can never hold a move.
            return
        if move in self.tabu_set:
            self.tabu_moves.remove(move)
        elif len(self.tabu_moves) == self.tenure:
            self.tabu_set.discard(self.tabu_moves.popleft())
        self.tabu_moves.append(move)
        self.tabu_set.add(move)

    def is_tabu(self, move: int) -> bool:
        """Return True while ``move`` is inside the tenure window."""
        return move in self.tabu_set

    def clear(self):
        """Forget every stored move."""
        self.tabu_moves.clear()
        self.tabu_set.clear()


def evaluate_all_flips_gpu(sequence: np.ndarray) -> np.ndarray:
    """Return the energy of every single-element sign flip of ``sequence``.

    Row ``i`` of an ``n x n`` working matrix is ``sequence`` with element
    ``i`` negated; all rows are evaluated with one batched FFT, on the GPU
    when CuPy is active and with NumPy otherwise. ``sequence`` itself is
    not modified.

    Args:
        sequence: 1-D array of sequence values.

    Returns:
        Host array of length ``n`` whose entry ``i`` is the sidelobe energy
        after flipping position ``i``. An empty input yields an empty array.
    """
    n = len(sequence)
    if n == 0:
        return np.zeros(0)

    if GPU_AVAILABLE:
        xp, dtype = cp, cp.float32
    else:
        xp, dtype = np, np.float64

    base = xp.asarray(sequence, dtype=dtype)
    # +1 everywhere except -1 on the diagonal: broadcasting against the
    # sequence negates exactly one element per row in a single operation.
    flip_signs = 1 - 2 * xp.eye(n, dtype=dtype)
    variants = base[None, :] * flip_signs

    # Zero-pad to length 2n so the circular autocorrelation from the power
    # spectrum equals the aperiodic one.
    padded = xp.zeros((n, 2 * n), dtype=dtype)
    padded[:, :n] = variants

    power_spectra = xp.abs(xp.fft.fft(padded, axis=1)) ** 2
    autocorrs = xp.fft.ifft(power_spectra, axis=1).real[:, :n]
    energies = xp.sum(autocorrs[:, 1:] ** 2, axis=1)

    return energies.get() if GPU_AVAILABLE else energies


def tabu_search_local_gpu(
    sequence: np.ndarray, 
    config: TabuSearchConfig, 
    verbose: bool = False
) -> Tuple[np.ndarray, float, int]:
    """Improve ``sequence`` with a single-flip tabu search.

    Every iteration scores all single-element flips, applies the overall
    best flip if it beats the best energy so far by more than
    ``config.aspiration_threshold`` (even when tabu), otherwise the lowest
    energy non-tabu flip, and records the applied position as tabu. The
    loop ends after ``config.max_iterations`` consecutive iterations
    without a new best, or when no admissible flip exists.

    Args:
        sequence: Starting sequence; it is copied, not modified.
        config: Tabu tenure, stagnation limit and aspiration margin.
        verbose: Accepted for interface compatibility; currently unused.

    Returns:
        ``(best sequence, best energy, number of iterations executed)``.
    """
    state = sequence.copy()
    state_energy = compute_energy_fft_gpu(state)

    incumbent, incumbent_energy = state.copy(), state_energy
    tabu = TabuList(config.tabu_tenure)

    stalled = 0
    steps = 0

    while stalled < config.max_iterations:
        steps += 1

        # Score every single flip of the current state in one batch.
        candidate_energies = evaluate_all_flips_gpu(state)

        overall_best = int(np.argmin(candidate_energies))
        overall_best_energy = candidate_energies[overall_best]

        # First non-tabu position in ascending-energy order, or -1 if none.
        admissible = next(
            (pos for pos in np.argsort(candidate_energies) if not tabu.is_tabu(pos)),
            -1,
        )

        if overall_best_energy < incumbent_energy - config.aspiration_threshold:
            # Aspiration: a new overall best overrides the tabu status.
            move, move_energy = overall_best, overall_best_energy
        elif admissible >= 0:
            move, move_energy = admissible, candidate_energies[admissible]
        else:
            break

        state[move] *= -1
        state_energy = move_energy
        tabu.add(move)

        if state_energy < incumbent_energy:
            incumbent = state.copy()
            incumbent_energy = state_energy
            stalled = 0
        else:
            stalled += 1

    return incumbent, incumbent_energy, steps


# ============================================================
# GPU-OPTIMIZED MEMETIC TABU SEARCH
# ============================================================
@dataclass
class MTSConfig:
    """Settings consumed by ``MemeticTabuSearchGPU``."""
    # Target number of individuals kept after crossover truncation,
    # intensification and diversification.
    population_size: int = 50
    # Number of best individuals carried over unchanged each generation.
    elite_size: int = 5
    # Tabu list capacity handed to the local tabu search.
    tabu_tenure: int = 7
    # Passed to the local tabu search as its ``max_iterations`` setting.
    local_search_iterations: int = 100
    # Probability that a parent pair is recombined.
    crossover_rate: float = 0.8
    # Per-element probability of a sign flip during mutation.
    mutation_rate: float = 0.1
    # Upper bound on the number of generations in ``run``.
    max_generations: int = 100
    # ``run`` stops after this many generations without a new best.
    stagnation_limit: int = 20
    # Stagnation count at which intensification is applied.
    intensify_threshold: int = 5
    # Stagnation count at which diversification is applied.
    diversify_threshold: int = 10
    # GPU optimization: number of individuals to apply local search to
    parallel_local_search: int = 5


class MemeticTabuSearchGPU:
    """GPU-optimized Memetic Tabu Search.

    Evolves a population of sequences with tournament selection, two-point
    crossover, sign-flip mutation, tabu local search on the best offspring
    and elitism. Lower energy is better throughout. Progress is recorded in
    ``history`` and wall-clock totals per phase in ``timing``.
    """
    
    def __init__(self, n_qubits: int, config: Optional["MTSConfig"] = None):
        """Store the sequence length and configuration (defaults if None)."""
        self.n_qubits = n_qubits
        self.config = config or MTSConfig()
        
        self.best_sequence = None
        self.best_energy = float('inf')
        self.best_merit_factor = 0.0
        self.history = []
        
        self.total_evaluations = 0
        self.generation = 0
        
        # Timing statistics
        self.timing = {
            'fitness_eval': 0.0,
            'local_search': 0.0,
            'genetic_ops': 0.0
        }
    
    def evaluate_population(self, population: np.ndarray) -> np.ndarray:
        """Return the energy of every row; updates timing and the evaluation count."""
        start = time.time()
        fitness = compute_energy_batch_gpu(population)
        self.timing['fitness_eval'] += time.time() - start
        self.total_evaluations += len(population)
        return fitness
    
    def select_parents_gpu(self, population: np.ndarray, fitness: np.ndarray) -> np.ndarray:
        """Vectorized tournament selection.

        Draws one size-3 tournament (with replacement) per population slot
        and returns copies of the lowest-energy entrant of each.
        """
        tournament_size = 3
        n_parents = len(population)
        
        # Generate all tournament indices at once
        all_tournaments = np.random.randint(0, len(population), 
                                            size=(n_parents, tournament_size))
        
        # Find winners
        tournament_fitness = fitness[all_tournaments]
        winner_local_idx = np.argmin(tournament_fitness, axis=1)
        winner_idx = all_tournaments[np.arange(n_parents), winner_local_idx]
        
        return population[winner_idx].copy()
    
    def crossover_batch_gpu(self, parents: np.ndarray) -> np.ndarray:
        """Two-point crossover over consecutive parent pairs.

        Rows ``2i`` and ``2i+1`` form a pair. With probability
        ``config.crossover_rate`` the segment between two cut points is
        swapped between them; otherwise both are copied. An unpaired last
        parent is copied unchanged on both code paths.

        NOTE(review): only the CPU path truncates the result to
        ``config.population_size`` - confirm whether the GPU path should too.
        """
        n_pairs = len(parents) // 2
        n = self.n_qubits
        
        if GPU_AVAILABLE and n_pairs > 10:
            parents_gpu = cp.asarray(parents)
            # Start from a copy so that pairs that do not cross, and an
            # unpaired trailing parent, pass through unchanged instead of
            # being left as all-zero rows.
            offspring = parents_gpu.copy()
            
            # Draw all decisions on the device, then copy them to the host
            # once so the loop below does not synchronise per element.
            do_crossover = (cp.random.random(n_pairs) < self.config.crossover_rate).get()
            points = cp.sort(cp.random.randint(0, n, size=(n_pairs, 2)), axis=1).get()
            
            for i in range(n_pairs):
                if not do_crossover[i]:
                    continue
                pt1, pt2 = int(points[i, 0]), int(points[i, 1])
                # Swap the middle segment between the two parents.
                offspring[2*i, pt1:pt2] = parents_gpu[2*i + 1, pt1:pt2]
                offspring[2*i + 1, pt1:pt2] = parents_gpu[2*i, pt1:pt2]
            
            return offspring.get()
        else:
            # CPU fallback
            offspring = []
            for i in range(0, len(parents) - 1, 2):
                c1, c2 = self._crossover_pair(parents[i], parents[i+1])
                offspring.extend([c1, c2])
            if len(parents) % 2 == 1:
                offspring.append(parents[-1].copy())
            return np.array(offspring[:self.config.population_size])
    
    def _crossover_pair(self, p1: np.ndarray, p2: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Return two children of ``p1``/``p2`` (plain copies if no crossover is drawn)."""
        if np.random.random() > self.config.crossover_rate:
            return p1.copy(), p2.copy()
        
        n = len(p1)
        points = sorted(np.random.choice(n, size=2, replace=False))
        
        c1, c2 = p1.copy(), p2.copy()
        c1[points[0]:points[1]] = p2[points[0]:points[1]]
        c2[points[0]:points[1]] = p1[points[0]:points[1]]
        
        return c1, c2
    
    def mutate_batch_gpu(self, population: np.ndarray) -> np.ndarray:
        """Negate each element independently with probability ``config.mutation_rate``.

        Returns a new host array; the input is not modified.
        """
        if GPU_AVAILABLE:
            pop_gpu = cp.asarray(population)
            # Generate mutation mask for entire population at once
            mutation_mask = cp.random.random(pop_gpu.shape) < self.config.mutation_rate
            # Apply mutations: flip where mask is True
            pop_gpu = cp.where(mutation_mask, -pop_gpu, pop_gpu)
            return pop_gpu.get()
        else:
            mutated = population.copy()
            mask = np.random.random(mutated.shape) < self.config.mutation_rate
            mutated[mask] *= -1
            return mutated
    
    def local_search_parallel(self, population: np.ndarray, fitness: np.ndarray, 
                              n_apply: int = 5) -> Tuple[np.ndarray, np.ndarray]:
        """Run tabu search on the ``n_apply`` lowest-energy individuals.

        ``population`` and ``fitness`` are updated in place and also returned.
        """
        start = time.time()
        
        # Select top individuals for local search
        top_indices = np.argsort(fitness)[:n_apply]
        
        config = TabuSearchConfig(
            tabu_tenure=self.config.tabu_tenure,
            max_iterations=self.config.local_search_iterations
        )
        
        for idx in top_indices:
            improved, energy, _ = tabu_search_local_gpu(population[idx], config)
            population[idx] = improved
            fitness[idx] = energy
        
        self.timing['local_search'] += time.time() - start
        return population, fitness
    
    def intensification(self, population: np.ndarray, fitness: np.ndarray) -> np.ndarray:
        """Add two 1-2 flip variants of each elite, then keep the best ``population_size`` rows."""
        elite_indices = np.argsort(fitness)[:self.config.elite_size]
        elite = population[elite_indices]
        
        new_population = list(population)
        for elite_seq in elite:
            for _ in range(2):
                variant = elite_seq.copy()
                n_flips = np.random.randint(1, 3)
                flip_positions = np.random.choice(len(variant), size=n_flips, replace=False)
                for pos in flip_positions:
                    variant[pos] *= -1
                new_population.append(variant)
        
        new_population = np.array(new_population)
        new_fitness = self.evaluate_population(new_population)
        best_indices = np.argsort(new_fitness)[:self.config.population_size]
        
        return new_population[best_indices]
    
    def diversification(self, population: np.ndarray, fitness: np.ndarray) -> np.ndarray:
        """Keep the elite and replace everyone else with uniformly random +1/-1 sequences."""
        elite_indices = np.argsort(fitness)[:self.config.elite_size]
        elite = population[elite_indices]
        
        n_new = self.config.population_size - self.config.elite_size
        new_solutions = np.random.choice([-1, 1], size=(n_new, self.n_qubits))
        
        return np.vstack([elite, new_solutions])
    
    def run(self, quantum_seeds: np.ndarray, verbose: bool = True) -> Tuple[np.ndarray, float, float]:
        """Run the memetic search starting from ``quantum_seeds``.

        Args:
            quantum_seeds: Initial population, one sequence per row.
            verbose: Print progress and a timing breakdown.

        Returns:
            ``(best sequence, best energy, best merit factor)``.
        """
        
        if verbose:
            print(f"GPU Acceleration: {'ENABLED' if GPU_AVAILABLE else 'DISABLED'}")
            print(f"Sequence length: {self.n_qubits}")
            print(f"Population size: {self.config.population_size}")
        
        population = quantum_seeds.copy()
        fitness = self.evaluate_population(population)
        
        best_idx = np.argmin(fitness)
        self.best_sequence = population[best_idx].copy()
        self.best_energy = fitness[best_idx]
        self.best_merit_factor = compute_merit_factor_fft(self.best_sequence)
        
        stagnation_counter = 0
        
        if verbose:
            print(f"\nInitial best: E={self.best_energy:.2f}, MF={self.best_merit_factor:.4f}")
            print("\n[Generation Progress]")
        
        for gen in range(self.config.max_generations):
            self.generation = gen
            gen_start = time.time()
            
            # Selection
            parents = self.select_parents_gpu(population, fitness)
            
            # Crossover and Mutation (GPU-optimized)
            start = time.time()
            offspring = self.crossover_batch_gpu(parents)
            offspring = self.mutate_batch_gpu(offspring)
            self.timing['genetic_ops'] += time.time() - start
            
            # Evaluate offspring
            offspring_fitness = self.evaluate_population(offspring)
            
            # Parallel local search on top individuals
            offspring, offspring_fitness = self.local_search_parallel(
                offspring, offspring_fitness, 
                n_apply=self.config.parallel_local_search
            )
            
            # Elitism: the previous generation's best individuals overwrite
            # the worst offspring. zip() pairs them up and stops at the
            # shorter list, so a small offspring array cannot be overrun.
            elite_indices = np.argsort(fitness)[:self.config.elite_size]
            worst_offspring_indices = np.argsort(offspring_fitness)[-self.config.elite_size:]
            
            for worst_idx, elite_idx in zip(worst_offspring_indices, elite_indices):
                offspring[worst_idx] = population[elite_idx].copy()
                offspring_fitness[worst_idx] = fitness[elite_idx]
            
            population = offspring
            fitness = offspring_fitness
            
            # Update best
            gen_best_idx = np.argmin(fitness)
            gen_best_energy = fitness[gen_best_idx]
            
            improved = False
            if gen_best_energy < self.best_energy:
                self.best_sequence = population[gen_best_idx].copy()
                self.best_energy = gen_best_energy
                self.best_merit_factor = compute_merit_factor_fft(self.best_sequence)
                stagnation_counter = 0
                improved = True
            else:
                stagnation_counter += 1
            
            gen_time = time.time() - gen_start
            
            self.history.append({
                'generation': gen,
                'best_energy': self.best_energy,
                'merit_factor': self.best_merit_factor,
                'gen_best_energy': gen_best_energy,
                'gen_mean_energy': np.mean(fitness),
                'gen_std_energy': np.std(fitness),
                'gen_time': gen_time,
                'improved': improved
            })
            
            if verbose and (gen % 10 == 0 or improved):
                print(f"  Gen {gen:3d}: Best E={self.best_energy:.2f}, "
                      f"MF={self.best_merit_factor:.4f}, Time={gen_time:.3f}s"
                      + (" *" if improved else ""))
            
            # Intensification/Diversification
            if stagnation_counter == self.config.intensify_threshold:
                if verbose:
                    print(f"  Gen {gen}: Applying intensification...")
                population = self.intensification(population, fitness)
                fitness = self.evaluate_population(population)
            
            elif stagnation_counter == self.config.diversify_threshold:
                if verbose:
                    print(f"  Gen {gen}: Applying diversification...")
                population = self.diversification(population, fitness)
                fitness = self.evaluate_population(population)
            
            if stagnation_counter >= self.config.stagnation_limit:
                if verbose:
                    print(f"\n  Stopping: No improvement for {stagnation_counter} generations")
                break
        
        # Intensification/diversification in the last executed generation
        # may have produced a better individual that no later generation
        # got the chance to record; check the final population once more.
        final_idx = np.argmin(fitness)
        if fitness[final_idx] < self.best_energy:
            self.best_sequence = population[final_idx].copy()
            self.best_energy = fitness[final_idx]
            self.best_merit_factor = compute_merit_factor_fft(self.best_sequence)
        
        if verbose:
            print("\n" + "-" * 70)
            print(f"Timing Breakdown:")
            print(f"  Fitness evaluation: {self.timing['fitness_eval']:.2f}s")
            print(f"  Local search: {self.timing['local_search']:.2f}s")
            print(f"  Genetic operations: {self.timing['genetic_ops']:.2f}s")
        
        return self.best_sequence, self.best_energy, self.best_merit_factor
        
 
def run_runtime_benchmark(qubit_range: List[int], n_layers: int = 5, 
                          n_pop: int = 50, max_gen: int = 20) -> dict:
    """Time the VQE and memetic-search phases for each qubit count.

    For every entry of ``qubit_range`` a quantum population is generated
    (VQE capped at 100 iterations) and then refined with
    ``MemeticTabuSearchGPU``; both phases are timed with ``time.time()``.

    Returns:
        Dict with the input ``qubit_range`` under ``'qubits'`` and one list
        per metric (``'vqe_times'``, ``'mts_times'``, ``'total_times'``,
        ``'best_energies'``, ``'merit_factors'``), aligned with it.
    """
    banner = "=" * 70
    results = {
        'qubits': qubit_range,
        'vqe_times': [],
        'mts_times': [],
        'total_times': [],
        'best_energies': [],
        'merit_factors': []
    }

    print(banner)
    print("RUNTIME SCALING BENCHMARK")
    print(banner)

    for n_qubits in qubit_range:
        print(f"\n--- Benchmarking {n_qubits} qubits ---")

        # Quantum phase: VQE optimisation plus sampling.
        tick = time.time()
        _, quantum_pop = generate_quantum_pop(n_qubits, n_layers, n_pop, max_iter=100)
        vqe_time = time.time() - tick

        # Classical phase: only the run() call is timed.
        solver = MemeticTabuSearchGPU(
            n_qubits,
            MTSConfig(
                population_size=n_pop,
                max_generations=max_gen,
                local_search_iterations=50,
                tabu_tenure=max(5, n_qubits // 2)
            ),
        )
        tick = time.time()
        _, best_energy, best_mf = solver.run(quantum_pop, verbose=False)
        mts_time = time.time() - tick

        measurements = (
            ('vqe_times', vqe_time),
            ('mts_times', mts_time),
            ('total_times', vqe_time + mts_time),
            ('best_energies', best_energy),
            ('merit_factors', best_mf),
        )
        for key, value in measurements:
            results[key].append(value)

        print(f"  VQE time: {vqe_time:.2f}s, MTS time: {mts_time:.2f}s")
        print(f"  Best energy: {best_energy:.2f}, Merit factor: {best_mf:.4f}")

    return results


def run_complete_hybrid_workflow_gpu(
    quantum_population: np.ndarray,
    n_qubits: int,
    mts_config: Optional[MTSConfig] = None,
    verbose: bool = True
) -> Tuple[np.ndarray, float, float, dict]:
    """Refine a quantum-sampled population with the memetic tabu search.

    Args:
        quantum_population: Seed sequences, one per row.
        n_qubits: Sequence length.
        mts_config: Search settings; when omitted, defaults sized from the
            seed population and ``n_qubits`` are used.
        verbose: Print a banner, progress and summary statistics.

    Returns:
        ``(best sequence, best energy, best merit factor, statistics)``
        where ``statistics`` holds evaluation/generation counts, elapsed
        time, the per-generation history and the timing breakdown.
    """
    n_seeds = len(quantum_population)

    if verbose:
        rule = "=" * 70
        print("\n" + rule)
        print("GPU-ACCELERATED QUANTUM-CLASSICAL HYBRID WORKFLOW")
        print(rule)
        print(f"\nGPU Status: {'ENABLED (CuPy)' if GPU_AVAILABLE else 'DISABLED (NumPy fallback)'}")
        print(f"Quantum input: {n_seeds} sequences of length {n_qubits}")

    if mts_config is None:
        mts_config = MTSConfig(
            population_size=max(100, n_seeds),
            max_generations=100,
            local_search_iterations=100,
            tabu_tenure=max(5, n_qubits // 2),
            stagnation_limit=30,
            parallel_local_search=5
        )

    solver = MemeticTabuSearchGPU(n_qubits, mts_config)

    began = time.time()
    best_seq, best_energy, best_mf = solver.run(
        quantum_seeds=quantum_population,
        verbose=verbose
    )
    elapsed_time = time.time() - began

    statistics = {
        'total_evaluations': solver.total_evaluations,
        # ``generation`` is the zero-based index of the last generation run.
        'generations': solver.generation + 1,
        'elapsed_time': elapsed_time,
        'history': solver.history,
        'quantum_seed_size': len(quantum_population),
        'timing_breakdown': solver.timing
    }

    if verbose:
        print(f"\nWorkflow Statistics:")
        print(f"  Total evaluations: {statistics['total_evaluations']}")
        print(f"  Generations: {statistics['generations']}")
        print(f"  Total time: {elapsed_time:.2f} seconds")

    return best_seq, best_energy, best_mf, statistics


# In [None]:
# ============================================================
# MAIN EXECUTION
# ============================================================
# Script entry point: generate a quantum-sampled seed population with VQE,
# refine it with the memetic tabu search and print the best result.
if __name__ == "__main__":
    # Configuration
    n_qubits = 7    # sequence length / number of qubits
    n_layers = 5    # ansatz layers
    n_pop = 100     # shots sampled, also used as the MTS population size
    
    print("=" * 70)
    print("LABS OPTIMIZATION - GPU-ACCELERATED HYBRID QUANTUM-CLASSICAL")
    print("=" * 70)
    
    # Generate quantum population
    vqe_history, quantum_population = generate_quantum_pop(n_qubits, n_layers, n_pop)
    print(f"Quantum population shape: {np.shape(quantum_population)}")
    
    # Configure MTS
    config = MTSConfig(
        population_size=n_pop,
        max_generations=50,
        local_search_iterations=75,
        tabu_tenure=6,
        parallel_local_search=5
    )
    
    # Run hybrid workflow
    best_seq, best_energy, best_mf, stats = run_complete_hybrid_workflow_gpu(
        quantum_population=quantum_population,
        n_qubits=n_qubits,
        mts_config=config,
        verbose=True
    )
    
    # Report the best sequence found and its energy.
    print(f"\n{'='*70}")
    print("FINAL RESULT")
    print(f"{'='*70}")
    print(f"Best sequence: {best_seq.tolist()}")
    print(f"Energy: {best_energy:.2f}")