diff --git a/Makefile b/Makefile
index 504a9a8..ad1a428 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: test test-race test-all lint vet fuzz bench coverage clean hooks
+.PHONY: test test-race test-all lint vet fuzz bench bench-gpu-cuda test-gpu-cuda coverage clean hooks
 
 test:
 	go test -count=1 -timeout=5m ./...
@@ -10,6 +10,8 @@ test-all: test-race
 	cd backend/braket && go test -race -count=1 -timeout=5m ./...
 	cd observe/otelbridge && go build ./...
 	cd observe/prombridge && go build ./...
+	cd sim/gpu/cuda && go build ./...
+	cd sim/gpu/metal && go build ./...
 
 lint:
 	golangci-lint run ./...
@@ -20,6 +22,8 @@ vet:
 	cd backend/braket && go vet ./...
 	cd observe/otelbridge && go vet ./...
 	cd observe/prombridge && go vet ./...
+	cd sim/gpu/cuda && go vet ./...
+	cd sim/gpu/metal && go vet ./...
 
 fuzz:
 	go test ./qasm/parser -run=^$$ -fuzz=FuzzParse -fuzztime=30s
@@ -41,6 +45,12 @@ bench:
 	go test ./sim/statevector/ -bench=. -count=5 -benchmem -run=^$$ -timeout=10m
 	go test ./sim/densitymatrix/ -bench=. -count=5 -benchmem -run=^$$ -timeout=10m
 
+test-gpu-cuda:
+	cd sim/gpu/cuda && go test -tags cuda -count=1 -timeout=5m ./...
+
+bench-gpu-cuda:
+	cd sim/gpu/cuda && go test -tags cuda -bench=. -count=5 -benchmem -run=^$$ -timeout=10m
+
 coverage:
 	go test -coverprofile=coverage.out ./...
 	go tool cover -html=coverage.out -o coverage.html
diff --git a/backend/local/local.go b/backend/local/local.go
index 1567aa1..1198d7c 100644
--- a/backend/local/local.go
+++ b/backend/local/local.go
@@ -12,6 +12,7 @@ import (
 
 	"github.com/splch/goqu/backend"
 	"github.com/splch/goqu/observe"
+	"github.com/splch/goqu/sim"
 	"github.com/splch/goqu/sim/pulsesim"
 	"github.com/splch/goqu/sim/statevector"
 	"github.com/splch/goqu/transpile/target"
@@ -19,11 +20,16 @@ import (
 
 var _ backend.Backend = (*Backend)(nil)
 
+// SimFactory creates a Simulator for the given number of qubits.
+// The default factory creates a CPU statevector simulator.
+type SimFactory func(numQubits int) (sim.Simulator, error)
+
 // Backend runs circuits on the local statevector simulator.
 type Backend struct {
-	maxQubits int
-	results   sync.Map // jobID → *backend.Result
-	logger    *slog.Logger
+	maxQubits  int
+	simFactory SimFactory
+	results    sync.Map // jobID → *backend.Result
+	logger     *slog.Logger
 }
 
 // Option configures a local Backend.
@@ -39,9 +45,21 @@ func WithLogger(l *slog.Logger) Option {
 	return func(b *Backend) { b.logger = l }
 }
 
+// WithSimulator sets a custom simulator factory. This allows plugging in
+// alternative simulators (e.g., GPU-accelerated) while keeping the same backend API.
+func WithSimulator(f SimFactory) Option {
+	return func(b *Backend) { b.simFactory = f }
+}
+
 // New creates a local simulator backend.
 func New(opts ...Option) *Backend {
-	b := &Backend{maxQubits: 28, logger: slog.Default()}
+	b := &Backend{
+		maxQubits: 28,
+		simFactory: func(numQubits int) (sim.Simulator, error) {
+			return statevector.New(numQubits), nil
+		},
+		logger: slog.Default(),
+	}
 	for _, opt := range opts {
 		opt(b)
 	}
@@ -92,8 +110,15 @@ func (b *Backend) Submit(ctx context.Context, req *backend.SubmitRequest) (*back
 	)
 
 	start := time.Now()
-	sim := statevector.New(nq)
-	counts, err := sim.Run(req.Circuit, req.Shots)
+	s, sErr := b.simFactory(nq)
+	if sErr != nil {
+		if simDone != nil {
+			simDone(sErr)
+		}
+		return nil, fmt.Errorf("local: %w", sErr)
+	}
+	defer func() { _ = s.Close() }()
+	counts, err := s.Run(req.Circuit, req.Shots)
 	elapsed := time.Since(start)
 
 	if simDone != nil {
diff --git a/sim/densitymatrix/sim.go b/sim/densitymatrix/sim.go
index fcca2eb..b3e11bd 100644
--- a/sim/densitymatrix/sim.go
+++ b/sim/densitymatrix/sim.go
@@ -12,6 +12,9 @@ import (
 	"github.com/splch/goqu/sim/pauli"
 )
 
+// Close is a no-op for the CPU density matrix simulator.
+func (s *Sim) Close() error { return nil }
+
 // parallelThreshold is the minimum number of qubits before enabling parallel kernels.
 // At 9 qubits dim=512 and the density matrix has 262K elements; the heavier
 // per-element work (row + column passes) justifies a lower threshold than statevector.
diff --git a/sim/gpu/cuda/benchmark_test.go b/sim/gpu/cuda/benchmark_test.go
new file mode 100644
index 0000000..cd38997
--- /dev/null
+++ b/sim/gpu/cuda/benchmark_test.go
@@ -0,0 +1,64 @@
+//go:build cuda
+
+package cuda
+
+import (
+	"testing"
+
+	"github.com/splch/goqu/circuit/builder"
+	"github.com/splch/goqu/sim/statevector"
+)
+
+// BenchmarkGPU_GHZ benchmarks GPU GHZ circuit creation across qubit counts.
+func BenchmarkGPU_GHZ(b *testing.B) {
+	for _, nq := range []int{12, 16, 20, 24, 28} {
+		bld := builder.New("ghz", nq)
+		bld.H(0)
+		for i := range nq - 1 {
+			bld.CNOT(i, i+1)
+		}
+		c, err := bld.Build()
+		if err != nil {
+			b.Fatal(err)
+		}
+
+		b.Run(qName("GPU", nq), func(b *testing.B) {
+			sim, err := New(nq)
+			if err != nil {
+				b.Skip("CUDA not available:", err)
+			}
+			defer sim.Close()
+			b.ResetTimer()
+			for range b.N {
+				sim.Evolve(c)
+			}
+		})
+	}
+}
+
+// BenchmarkCPU_GHZ benchmarks CPU GHZ for comparison.
+func BenchmarkCPU_GHZ(b *testing.B) {
+	for _, nq := range []int{12, 16, 20, 24, 28} {
+		bld := builder.New("ghz", nq)
+		bld.H(0)
+		for i := range nq - 1 {
+			bld.CNOT(i, i+1)
+		}
+		c, err := bld.Build()
+		if err != nil {
+			b.Fatal(err)
+		}
+
+		b.Run(qName("CPU", nq), func(b *testing.B) {
+			sim := statevector.New(nq)
+			b.ResetTimer()
+			for range b.N {
+				sim.Evolve(c)
+			}
+		})
+	}
+}
+
+func qName(prefix string, nq int) string {
+	return prefix + "_" + string(rune('0'+nq/10)) + string(rune('0'+nq%10)) + "Q"
+}
diff --git a/sim/gpu/cuda/expect.go b/sim/gpu/cuda/expect.go
new file mode 100644
index 0000000..53a79f9
--- /dev/null
+++ b/sim/gpu/cuda/expect.go
@@ -0,0 +1,79 @@
+//go:build cuda
+
+package cuda
+
+/*
+#include <custatevec.h>
+
+// goPauliExpect computes ⟨ψ|P|ψ⟩ for a Pauli string on the GPU.
+static custatevecStatus_t goPauliExpect(
+    custatevecHandle_t handle,
+    const void *sv, cudaDataType_t svDataType, int nQubits,
+    double *expectationValue,
+    const custatevecPauli_t *pauliOps, const int32_t *basisBits, int nBasisBits
+) {
+    return custatevecComputeExpectation(
+        handle, sv, svDataType, (uint32_t)nQubits,
+        expectationValue, CUDA_R_64F,
+        pauliOps, basisBits, (uint32_t)nBasisBits
+    );
+}
+*/
+import "C"
+import (
+	"fmt"
+
+	"github.com/splch/goqu/sim/pauli"
+)
+
+// pauliToCUSV maps goqu Pauli values to cuStateVec Pauli enum values.
+func pauliToCUSV(p pauli.Pauli) C.custatevecPauli_t {
+	switch p {
+	case pauli.I:
+		return C.CUSTATEVEC_PAULI_I
+	case pauli.X:
+		return C.CUSTATEVEC_PAULI_X
+	case pauli.Y:
+		return C.CUSTATEVEC_PAULI_Y
+	case pauli.Z:
+		return C.CUSTATEVEC_PAULI_Z
+	default:
+		return C.CUSTATEVEC_PAULI_I
+	}
+}
+
+// ExpectPauliString computes Re(⟨ψ|P|ψ⟩) for a Pauli string P on the GPU.
+func (s *Sim) ExpectPauliString(ps pauli.PauliString) (float64, error) {
+	if ps.NumQubits() != s.numQubits {
+		return 0, fmt.Errorf("cuda: PauliString has %d qubits, simulator has %d",
+			ps.NumQubits(), s.numQubits)
+	}
+
+	ops := ps.Ops()
+	nBasis := len(ops)
+	if nBasis == 0 {
+		return real(ps.Coeff()), nil
+	}
+
+	pauliOps := make([]C.custatevecPauli_t, nBasis)
+	basisBits := make([]C.int32_t, nBasis)
+	i := 0
+	for qubit, p := range ops {
+		pauliOps[i] = pauliToCUSV(p)
+		basisBits[i] = C.int32_t(qubit)
+		i++
+	}
+
+	var expect C.double
+	st := C.goPauliExpect(
+		s.handle.h,
+		s.devicePtr.ptr, C.CUDA_C_64F, C.int(s.numQubits),
+		&expect,
+		&pauliOps[0], &basisBits[0], C.int(nBasis),
+	)
+	if st != C.CUSTATEVEC_STATUS_SUCCESS {
+		return 0, fmt.Errorf("custatevecComputeExpectation failed: status %d", int(st))
+	}
+
+	return real(ps.Coeff()) * float64(expect), nil
+}
diff --git a/sim/gpu/cuda/go.mod b/sim/gpu/cuda/go.mod
new file mode 100644
index 0000000..36aae87
--- /dev/null
+++ b/sim/gpu/cuda/go.mod
@@ -0,0 +1,7 @@
+module github.com/splch/goqu/sim/gpu/cuda
+
+go 1.24
+
+require github.com/splch/goqu v0.0.0
+
+replace github.com/splch/goqu => ../../../
diff --git a/sim/gpu/cuda/handle.go b/sim/gpu/cuda/handle.go
new file mode 100644
index 0000000..81d71c2
--- /dev/null
+++ b/sim/gpu/cuda/handle.go
@@ -0,0 +1,51 @@
+//go:build cuda
+
+package cuda
+
+/*
+#cgo LDFLAGS: -lcustatevec -lcudart
+#include <custatevec.h>
+#include <cuda_runtime.h>
+
+// createHandle wraps custatevecCreate.
+static custatevecStatus_t goCreateHandle(custatevecHandle_t *handle) {
+    return custatevecCreate(handle);
+}
+
+// destroyHandle wraps custatevecDestroy.
+static custatevecStatus_t goDestroyHandle(custatevecHandle_t handle) {
+    return custatevecDestroy(handle);
+}
+*/
+import "C"
+import (
+	"fmt"
+	"unsafe"
+)
+
+type cusvHandle struct {
+	h      C.custatevecHandle_t
+	stream C.cudaStream_t
+}
+
+type deviceAlloc struct {
+	ptr  unsafe.Pointer
+	size int // number of complex128 elements
+}
+
+func createHandle() (cusvHandle, error) {
+	var h cusvHandle
+	if st := C.goCreateHandle(&h.h); st != C.CUSTATEVEC_STATUS_SUCCESS {
+		return h, fmt.Errorf("custatevecCreate failed: status %d", int(st))
+	}
+	if st := C.cudaStreamCreate(&h.stream); st != C.cudaSuccess {
+		C.goDestroyHandle(h.h)
+		return h, fmt.Errorf("cudaStreamCreate failed: status %d", int(st))
+	}
+	return h, nil
+}
+
+func destroyHandle(h cusvHandle) {
+	C.cudaStreamDestroy(h.stream)
+	C.goDestroyHandle(h.h)
+}
diff --git a/sim/gpu/cuda/kernels.go b/sim/gpu/cuda/kernels.go
new file mode 100644
index 0000000..384861f
--- /dev/null
+++ b/sim/gpu/cuda/kernels.go
@@ -0,0 +1,253 @@
+//go:build cuda
+
+package cuda
+
+/*
+#include <custatevec.h>
+
+// applyMatrix wraps custatevecApplyMatrix for gate application.
+static custatevecStatus_t goApplyMatrix(
+    custatevecHandle_t handle,
+    void *sv, cudaDataType_t svDataType, int nQubits,
+    const void *matrix, cudaDataType_t matrixDataType,
+    custatevecMatrixLayout_t layout,
+    int adjoint,
+    const int32_t *targets, int nTargets,
+    const int32_t *controls, const int32_t *controlBitValues, int nControls,
+    custatevecComputeType_t computeType,
+    void *workspace, size_t workspaceSize
+) {
+    return custatevecApplyMatrix(
+        handle, sv, svDataType, (uint32_t)nQubits,
+        matrix, matrixDataType, layout, (int32_t)adjoint,
+        targets, (uint32_t)nTargets,
+        controls, controlBitValues, (uint32_t)nControls,
+        computeType, workspace, workspaceSize
+    );
+}
+
+// getApplyMatrixWorkspaceSize wraps custatevecApplyMatrixGetWorkspaceSize.
+static custatevecStatus_t goGetWorkspaceSize(
+    custatevecHandle_t handle,
+    cudaDataType_t svDataType, int nQubits,
+    const void *matrix, cudaDataType_t matrixDataType,
+    custatevecMatrixLayout_t layout,
+    int adjoint,
+    int nTargets, int nControls,
+    custatevecComputeType_t computeType,
+    size_t *workspaceSize
+) {
+    return custatevecApplyMatrixGetWorkspaceSize(
+        handle, svDataType, (uint32_t)nQubits,
+        matrix, matrixDataType, layout, (int32_t)adjoint,
+        (uint32_t)nTargets, (uint32_t)nControls,
+        computeType, workspaceSize
+    );
+}
+*/
+import "C"
+import (
+	"fmt"
+	"math"
+	"unsafe"
+
+	"github.com/splch/goqu/circuit/gate"
+	"github.com/splch/goqu/circuit/ir"
+)
+
+// applyGate sends a gate matrix to the GPU and applies it via custatevecApplyMatrix.
+func applyGate(s *Sim, targets []int, controls []int, m []complex128) error {
+	nTargets := len(targets)
+	nControls := len(controls)
+
+	// Convert to int32 slices for C.
+	tgts := make([]C.int32_t, nTargets)
+	for i, t := range targets {
+		tgts[i] = C.int32_t(t)
+	}
+
+	var ctrlPtr *C.int32_t
+	var ctrlBitsPtr *C.int32_t
+	var ctrlBits []C.int32_t
+	if nControls > 0 {
+		ctrls := make([]C.int32_t, nControls)
+		ctrlBits = make([]C.int32_t, nControls)
+		for i, c := range controls {
+			ctrls[i] = C.int32_t(c)
+			ctrlBits[i] = 1 // control on |1>
+		}
+		ctrlPtr = &ctrls[0]
+		ctrlBitsPtr = &ctrlBits[0]
+	}
+
+	// Query workspace size.
+	var wsSize C.size_t
+	st := C.goGetWorkspaceSize(
+		s.handle.h,
+		C.CUDA_C_64F, C.int(s.numQubits),
+		unsafe.Pointer(&m[0]), C.CUDA_C_64F,
+		C.CUSTATEVEC_MATRIX_LAYOUT_ROW,
+		0,
+		C.int(nTargets), C.int(nControls),
+		C.CUSTATEVEC_COMPUTE_64F,
+		&wsSize,
+	)
+	if st != C.CUSTATEVEC_STATUS_SUCCESS {
+		return fmt.Errorf("custatevecApplyMatrixGetWorkspaceSize failed: status %d", int(st))
+	}
+
+	// Allocate workspace if needed.
+	var wsPtr unsafe.Pointer
+	if wsSize > 0 {
+		if cst := C.cudaMalloc(&wsPtr, wsSize); cst != C.cudaSuccess {
+			return fmt.Errorf("cudaMalloc workspace failed: status %d", int(cst))
+		}
+		defer C.cudaFree(wsPtr)
+	}
+
+	// Apply the gate.
+	st = C.goApplyMatrix(
+		s.handle.h,
+		s.devicePtr.ptr, C.CUDA_C_64F, C.int(s.numQubits),
+		unsafe.Pointer(&m[0]), C.CUDA_C_64F,
+		C.CUSTATEVEC_MATRIX_LAYOUT_ROW,
+		0,
+		&tgts[0], C.int(nTargets),
+		ctrlPtr, ctrlBitsPtr, C.int(nControls),
+		C.CUSTATEVEC_COMPUTE_64F,
+		wsPtr, wsSize,
+	)
+	if st != C.CUSTATEVEC_STATUS_SUCCESS {
+		return fmt.Errorf("custatevecApplyMatrix failed: status %d", int(st))
+	}
+	return nil
+}
+
+// evolve applies all gate operations in the circuit on the GPU.
+func evolve(s *Sim, c *ir.Circuit) error {
+	if c.NumQubits() != s.numQubits {
+		return fmt.Errorf("circuit has %d qubits, simulator has %d", c.NumQubits(), s.numQubits)
+	}
+	if err := resetDevice(s.devicePtr); err != nil {
+		return err
+	}
+
+	for _, op := range c.Ops() {
+		if op.Gate == nil || op.Gate.Name() == "barrier" {
+			continue
+		}
+		if op.Gate.Name() == "reset" {
+			// Transfer to host, reset qubit, transfer back.
+			host := copyToHost(s.devicePtr)
+			resetQubitCPU(host, s.numQubits, op.Qubits[0])
+			C.goMemcpyH2D(s.devicePtr.ptr, unsafe.Pointer(&host[0]), C.size_t(len(host)*16))
+			continue
+		}
+		if _, ok := op.Gate.(gate.StatePrepable); ok {
+			// State prep: copy amplitudes directly to GPU.
+			sp := op.Gate.(gate.StatePrepable)
+			amps := sp.Amplitudes()
+			if len(op.Qubits) == s.numQubits {
+				allInOrder := true
+				for i, q := range op.Qubits {
+					if q != i {
+						allInOrder = false
+						break
+					}
+				}
+				if allInOrder {
+					// Fast path: copy amplitudes directly.
+					C.goMemcpyH2D(s.devicePtr.ptr, unsafe.Pointer(&amps[0]), C.size_t(len(amps)*16))
+					continue
+				}
+			}
+			// Slow path: decompose into 1Q/2Q gates.
+			applied := op.Gate.Decompose(op.Qubits)
+			for _, a := range applied {
+				m := a.Gate.Matrix()
+				if m == nil {
+					continue
+				}
+				if err := applyGate(s, a.Qubits, nil, m); err != nil {
+					return err
+				}
+			}
+			continue
+		}
+
+		m := op.Gate.Matrix()
+		if m == nil {
+			continue
+		}
+
+		if cg, ok := op.Gate.(gate.ControlledGate); ok {
+			nControls := cg.NumControls()
+			controls := op.Qubits[:nControls]
+			targets := op.Qubits[nControls:]
+			innerM := cg.Inner().Matrix()
+			if innerM != nil {
+				if err := applyGate(s, targets, controls, innerM); err != nil {
+					return err
+				}
+				continue
+			}
+		}
+
+		// General gate: all qubits are targets.
+		if err := applyGate(s, op.Qubits, nil, m); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// resetQubitCPU deterministically resets a qubit to |0⟩ on the host side.
+func resetQubitCPU(state []complex128, numQubits, qubit int) {
+	halfBlock := 1 << qubit
+	block := halfBlock << 1
+	nAmps := 1 << numQubits
+	for b0 := 0; b0 < nAmps; b0 += block {
+		for offset := range halfBlock {
+			i0 := b0 + offset
+			i1 := i0 + halfBlock
+			a0, a1 := state[i0], state[i1]
+			norm := math.Sqrt(real(a0)*real(a0) + imag(a0)*imag(a0) +
+				real(a1)*real(a1) + imag(a1)*imag(a1))
+			if norm > 1e-15 {
+				state[i0] = complex(norm, 0)
+			} else {
+				state[i0] = 0
+			}
+			state[i1] = 0
+		}
+	}
+}
+
+// newSim creates a GPU simulator with cuStateVec.
+func newSim(numQubits int) (*Sim, error) {
+	if numQubits < 1 {
+		return nil, fmt.Errorf("cuda: numQubits must be >= 1, got %d", numQubits)
+	}
+	h, err := createHandle()
+	if err != nil {
+		return nil, err
+	}
+	d, err := allocDevice(numQubits)
+	if err != nil {
+		destroyHandle(h)
+		return nil, err
+	}
+	return &Sim{numQubits: numQubits, handle: h, devicePtr: d}, nil
+}
+
+// stateVector copies the full state from GPU to host.
+func stateVector(s *Sim) []complex128 {
+	return copyToHost(s.devicePtr)
+}
+
+// closeSim frees GPU resources.
+func closeSim(s *Sim) error {
+	freeDevice(&s.devicePtr)
+	destroyHandle(s.handle)
+	return nil
+}
diff --git a/sim/gpu/cuda/kernels_stub.go b/sim/gpu/cuda/kernels_stub.go
new file mode 100644
index 0000000..b58936a
--- /dev/null
+++ b/sim/gpu/cuda/kernels_stub.go
@@ -0,0 +1,39 @@
+//go:build !cuda
+
+package cuda
+
+import (
+	"fmt"
+	"unsafe"
+
+	"github.com/splch/goqu/circuit/ir"
+)
+
+type cusvHandle struct{}
+
+type deviceAlloc struct {
+	ptr  unsafe.Pointer
+	size int
+}
+
+var errNoCUDA = fmt.Errorf("cuda: not available (build with -tags cuda)")
+
+func newSim(numQubits int) (*Sim, error) {
+	return nil, errNoCUDA
+}
+
+func run(_ *Sim, _ *ir.Circuit, _ int) (map[string]int, error) {
+	return nil, errNoCUDA
+}
+
+func evolve(_ *Sim, _ *ir.Circuit) error {
+	return errNoCUDA
+}
+
+func stateVector(_ *Sim) []complex128 {
+	return nil
+}
+
+func closeSim(_ *Sim) error {
+	return nil
+}
diff --git a/sim/gpu/cuda/measure.go b/sim/gpu/cuda/measure.go
new file mode 100644
index 0000000..758e562
--- /dev/null
+++ b/sim/gpu/cuda/measure.go
@@ -0,0 +1,197 @@
+//go:build cuda
+
+package cuda
+
+/*
+#include <custatevec.h>
+
+// goAbs2Sum computes probabilities on Z-basis.
+static custatevecStatus_t goAbs2Sum(
+    custatevecHandle_t handle,
+    const void *sv, cudaDataType_t svDataType, int nQubits,
+    double *abs2sum0, double *abs2sum1,
+    const int32_t *bitOrdering, int bitOrderingLen
+) {
+    return custatevecAbs2SumOnZBasis(
+        handle, sv, svDataType, (uint32_t)nQubits,
+        abs2sum0, abs2sum1,
+        bitOrdering, (uint32_t)bitOrderingLen
+    );
+}
+
+// goSamplerCreate creates a batched sampler.
+static custatevecStatus_t goSamplerCreate(
+    custatevecHandle_t handle,
+    const void *sv, cudaDataType_t svDataType, int nQubits,
+    custatevecSamplerDescriptor_t *sampler,
+    int nMaxShots,
+    void *workspace, size_t workspaceSize
+) {
+    return custatevecSamplerCreate(
+        handle, sv, svDataType, (uint32_t)nQubits,
+        sampler, (uint32_t)nMaxShots,
+        workspace, workspaceSize
+    );
+}
+
+// goSamplerGetWorkspaceSize gets required workspace.
+static custatevecStatus_t goSamplerGetWorkspaceSize(
+    custatevecHandle_t handle,
+    cudaDataType_t svDataType, int nQubits,
+    int nMaxShots,
+    size_t *workspaceSize
+) {
+    return custatevecSamplerPreprocess(
+        handle, NULL, svDataType, (uint32_t)nQubits,
+        NULL, (uint32_t)nMaxShots,
+        NULL, workspaceSize
+    );
+}
+
+// goSamplerSample draws samples.
+static custatevecStatus_t goSamplerSample(
+    custatevecSamplerDescriptor_t sampler,
+    custatevecIndex_t *bitStrings,
+    const int32_t *bitOrdering, int bitOrderingLen,
+    const double *randnums, int nShots,
+    enum custatevecSamplerOutput_t output
+) {
+    return custatevecSamplerSample(
+        sampler, bitStrings,
+        bitOrdering, (uint32_t)bitOrderingLen,
+        randnums, (uint32_t)nShots, output
+    );
+}
+
+// goSamplerDestroy destroys the sampler.
+static custatevecStatus_t goSamplerDestroy(custatevecSamplerDescriptor_t sampler) {
+    return custatevecSamplerDestroy(sampler);
+}
+
+// goSamplerPreprocess preprocesses the statevector for sampling.
+static custatevecStatus_t goSamplerPreprocess(
+    custatevecHandle_t handle,
+    custatevecSamplerDescriptor_t sampler,
+    cudaDataType_t svDataType, int nQubits,
+    void *workspace, size_t workspaceSize
+) {
+    size_t ws = workspaceSize;
+    return custatevecSamplerPreprocess(
+        handle, sampler, svDataType, (uint32_t)nQubits,
+        workspace, 0,
+        workspace, &ws
+    );
+}
+*/
+import "C"
+import (
+	"fmt"
+	"math/rand/v2"
+	"unsafe"
+
+	"github.com/splch/goqu/circuit/ir"
+)
+
+// run executes the circuit and samples measurement results on the GPU.
+func run(s *Sim, c *ir.Circuit, shots int) (map[string]int, error) {
+	if c.IsDynamic() {
+		return nil, fmt.Errorf("cuda: dynamic circuits not supported")
+	}
+	if err := evolve(s, c); err != nil {
+		return nil, err
+	}
+	if shots <= 0 {
+		return make(map[string]int), nil
+	}
+
+	// Build bit ordering: [0, 1, ..., nQubits-1].
+	bitOrdering := make([]C.int32_t, s.numQubits)
+	for i := range s.numQubits {
+		bitOrdering[i] = C.int32_t(i)
+	}
+
+	// Create sampler.
+	var sampler C.custatevecSamplerDescriptor_t
+	var wsSize C.size_t
+
+	// Get workspace size.
+	st := C.custatevecSamplerPreprocess(
+		s.handle.h, nil, C.CUDA_C_64F, C.uint32_t(s.numQubits),
+		nil, C.uint32_t(shots),
+		nil, &wsSize,
+	)
+	if st != C.CUSTATEVEC_STATUS_SUCCESS {
+		return nil, fmt.Errorf("custatevecSamplerPreprocess (size) failed: status %d", int(st))
+	}
+
+	// Allocate workspace.
+	var wsPtr unsafe.Pointer
+	if wsSize > 0 {
+		if cst := C.cudaMalloc(&wsPtr, wsSize); cst != C.cudaSuccess {
+			return nil, fmt.Errorf("cudaMalloc sampler workspace failed: status %d", int(cst))
+		}
+		defer C.cudaFree(wsPtr)
+	}
+
+	// Create the sampler.
+	st = C.goSamplerCreate(
+		s.handle.h,
+		s.devicePtr.ptr, C.CUDA_C_64F, C.int(s.numQubits),
+		&sampler, C.int(shots),
+		wsPtr, wsSize,
+	)
+	if st != C.CUSTATEVEC_STATUS_SUCCESS {
+		return nil, fmt.Errorf("custatevecSamplerCreate failed: status %d", int(st))
+	}
+	defer C.goSamplerDestroy(sampler)
+
+	// Preprocess.
+	st = C.goSamplerPreprocess(
+		s.handle.h, sampler,
+		C.CUDA_C_64F, C.int(s.numQubits),
+		wsPtr, wsSize,
+	)
+	if st != C.CUSTATEVEC_STATUS_SUCCESS {
+		return nil, fmt.Errorf("custatevecSamplerPreprocess failed: status %d", int(st))
+	}
+
+	// Generate random numbers.
+	rng := rand.New(rand.NewPCG(rand.Uint64(), rand.Uint64()))
+	randNums := make([]C.double, shots)
+	for i := range shots {
+		randNums[i] = C.double(rng.Float64())
+	}
+
+	// Sample.
+	bitStrings := make([]C.custatevecIndex_t, shots)
+	st = C.goSamplerSample(
+		sampler,
+		&bitStrings[0],
+		&bitOrdering[0], C.int(s.numQubits),
+		&randNums[0], C.int(shots),
+		C.CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER,
+	)
+	if st != C.CUSTATEVEC_STATUS_SUCCESS {
+		return nil, fmt.Errorf("custatevecSamplerSample failed: status %d", int(st))
+	}
+
+	// Convert to bitstring counts.
+	counts := make(map[string]int)
+	for _, idx := range bitStrings {
+		bs := formatBitstring(int(idx), s.numQubits)
+		counts[bs]++
+	}
+	return counts, nil
+}
+
+func formatBitstring(idx, n int) string {
+	bs := make([]byte, n)
+	for i := range n {
+		if idx&(1<<i) != 0 {
+			bs[n-1-i] = '1'
+		} else {
+			bs[n-1-i] = '0'
+		}
+	}
+	return string(bs)
+}
diff --git a/sim/gpu/cuda/memory.go b/sim/gpu/cuda/memory.go
new file mode 100644
index 0000000..5701fbb
--- /dev/null
+++ b/sim/gpu/cuda/memory.go
@@ -0,0 +1,86 @@
+//go:build cuda
+
+package cuda
+
+/*
+#include <cuda_runtime.h>
+#include <string.h>
+
+static cudaError_t goMalloc(void **ptr, size_t size) {
+    return cudaMalloc(ptr, size);
+}
+
+static cudaError_t goFree(void *ptr) {
+    return cudaFree(ptr);
+}
+
+static cudaError_t goMemcpyH2D(void *dst, const void *src, size_t size) {
+    return cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
+}
+
+static cudaError_t goMemcpyD2H(void *dst, const void *src, size_t size) {
+    return cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost);
+}
+*/
+import "C"
+import (
+	"fmt"
+	"unsafe"
+)
+
+// allocDevice allocates GPU memory for 2^numQubits complex128 values
+// and initializes to |0...0>.
+func allocDevice(numQubits int) (deviceAlloc, error) {
+	n := 1 << numQubits
+	byteSize := C.size_t(n * 16) // complex128 = 16 bytes
+
+	var d deviceAlloc
+	d.size = n
+
+	if st := C.goMalloc(&d.ptr, byteSize); st != C.cudaSuccess {
+		return d, fmt.Errorf("cudaMalloc failed: status %d", int(st))
+	}
+
+	// Zero the device memory, then set |0> = 1+0i.
+	if st := C.cudaMemset(d.ptr, 0, byteSize); st != C.cudaSuccess {
+		C.goFree(d.ptr)
+		return d, fmt.Errorf("cudaMemset failed: status %d", int(st))
+	}
+
+	// Set amplitude[0] = 1+0i on the host side, then copy.
+	init := complex(1.0, 0.0)
+	if st := C.goMemcpyH2D(d.ptr, unsafe.Pointer(&init), 16); st != C.cudaSuccess {
+		C.goFree(d.ptr)
+		return d, fmt.Errorf("cudaMemcpy H2D failed: status %d", int(st))
+	}
+
+	return d, nil
+}
+
+// resetDevice re-initializes the state vector to |0...0>.
+func resetDevice(d deviceAlloc) error {
+	byteSize := C.size_t(d.size * 16)
+	if st := C.cudaMemset(d.ptr, 0, byteSize); st != C.cudaSuccess {
+		return fmt.Errorf("cudaMemset failed: status %d", int(st))
+	}
+	init := complex(1.0, 0.0)
+	if st := C.goMemcpyH2D(d.ptr, unsafe.Pointer(&init), 16); st != C.cudaSuccess {
+		return fmt.Errorf("cudaMemcpy H2D failed: status %d", int(st))
+	}
+	return nil
+}
+
+// copyToHost transfers the full state vector from GPU to a host slice.
+func copyToHost(d deviceAlloc) []complex128 {
+	out := make([]complex128, d.size)
+	C.goMemcpyD2H(unsafe.Pointer(&out[0]), d.ptr, C.size_t(d.size*16))
+	return out
+}
+
+// freeDevice releases GPU memory.
+func freeDevice(d *deviceAlloc) {
+	if d.ptr != nil {
+		C.goFree(d.ptr)
+		d.ptr = nil
+	}
+}
diff --git a/sim/gpu/cuda/sim.go b/sim/gpu/cuda/sim.go
new file mode 100644
index 0000000..a16d628
--- /dev/null
+++ b/sim/gpu/cuda/sim.go
@@ -0,0 +1,59 @@
+// Package cuda provides a GPU-accelerated statevector simulator using NVIDIA cuStateVec.
+//
+// The simulator satisfies [sim.Simulator] and can be used as a drop-in replacement
+// for the CPU statevector simulator via [backend/local.WithSimulator]:
+//
+//	import gpucuda "github.com/splch/goqu/sim/gpu/cuda"
+//	local.New(local.WithSimulator(gpucuda.NewSimulator))
+//
+// Build requires the cuda build tag and a working cuStateVec installation.
+// Without the tag, stub implementations return an error.
+package cuda
+
+import (
+	"github.com/splch/goqu/circuit/ir"
+	"github.com/splch/goqu/sim"
+)
+
+var _ sim.Simulator = (*Sim)(nil)
+
+// Sim is a GPU-accelerated statevector simulator backed by cuStateVec.
+// The state vector lives on the GPU for its entire lifetime; only gate matrices
+// (16-128 bytes each) are transferred per operation.
+type Sim struct {
+	numQubits int
+	handle    cusvHandle  // cuStateVec handle + CUDA stream
+	devicePtr deviceAlloc // GPU memory for the state vector
+}
+
+// NewSimulator is a [sim.SimFactory] that creates a GPU simulator.
+// It can be passed directly to [backend/local.WithSimulator].
+func NewSimulator(numQubits int) (sim.Simulator, error) {
+	return New(numQubits)
+}
+
+// New creates a GPU simulator initialized to |0...0>.
+// Returns an error if CUDA or cuStateVec initialization fails.
+func New(numQubits int) (*Sim, error) {
+	return newSim(numQubits)
+}
+
+// Run executes the circuit for the given number of shots and returns measurement counts.
+func (s *Sim) Run(c *ir.Circuit, shots int) (map[string]int, error) {
+	return run(s, c, shots)
+}
+
+// Evolve applies all gate operations without measuring.
+func (s *Sim) Evolve(c *ir.Circuit) error {
+	return evolve(s, c)
+}
+
+// StateVector copies the state vector from GPU to host and returns it.
+func (s *Sim) StateVector() []complex128 {
+	return stateVector(s)
+}
+
+// Close releases the cuStateVec handle and frees GPU memory.
+func (s *Sim) Close() error {
+	return closeSim(s)
+}
diff --git a/sim/gpu/cuda/sim_test.go b/sim/gpu/cuda/sim_test.go
new file mode 100644
index 0000000..0901403
--- /dev/null
+++ b/sim/gpu/cuda/sim_test.go
@@ -0,0 +1,269 @@
+//go:build cuda
+
+package cuda
+
+import (
+	"math"
+	"math/cmplx"
+	"testing"
+
+	"github.com/splch/goqu/circuit/builder"
+	"github.com/splch/goqu/circuit/gate"
+	"github.com/splch/goqu/sim/statevector"
+)
+
+const eps = 1e-10
+
+func TestBellState(t *testing.T) {
+	c, err := builder.New("bell", 2).
+		H(0).
+		CNOT(0, 1).
+		Build()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	sim, err := New(2)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer sim.Close()
+
+	if err := sim.Evolve(c); err != nil {
+		t.Fatal(err)
+	}
+
+	sv := sim.StateVector()
+	s2 := 1.0 / math.Sqrt2
+	want := []complex128{complex(s2, 0), 0, 0, complex(s2, 0)}
+	assertStateClose(t, sv, want)
+}
+
+func TestGHZ3(t *testing.T) {
+	c, err := builder.New("ghz3", 3).
+		H(0).
+		CNOT(0, 1).
+		CNOT(1, 2).
+		Build()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	sim, err := New(3)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer sim.Close()
+
+	if err := sim.Evolve(c); err != nil {
+		t.Fatal(err)
+	}
+
+	sv := sim.StateVector()
+	s2 := 1.0 / math.Sqrt2
+	want := []complex128{complex(s2, 0), 0, 0, 0, 0, 0, 0, complex(s2, 0)}
+	assertStateClose(t, sv, want)
+}
+
+func TestSingleX(t *testing.T) {
+	c, err := builder.New("x", 1).X(0).Build()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	sim, err := New(1)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer sim.Close()
+
+	if err := sim.Evolve(c); err != nil {
+		t.Fatal(err)
+	}
+
+	sv := sim.StateVector()
+	want := []complex128{0, 1}
+	assertStateClose(t, sv, want)
+}
+
+func TestMeasurementCounts(t *testing.T) {
+	c, err := builder.New("bell", 2).
+		H(0).
+		CNOT(0, 1).
+		MeasureAll().
+		Build()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	sim, err := New(2)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer sim.Close()
+
+	counts, err := sim.Run(c, 10000)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	for k := range counts {
+		if k != "00" && k != "11" {
+			t.Errorf("unexpected measurement outcome: %q", k)
+		}
+	}
+	c00 := counts["00"]
+	c11 := counts["11"]
+	if c00 < 4000 || c00 > 6000 {
+		t.Errorf("counts[00] = %d, expected ~5000", c00)
+	}
+	if c11 < 4000 || c11 > 6000 {
+		t.Errorf("counts[11] = %d, expected ~5000", c11)
+	}
+}
+
+func TestSWAP(t *testing.T) {
+	c, err := builder.New("swap", 2).
+		X(0).
+		SWAP(0, 1).
+		Build()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	sim, err := New(2)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer sim.Close()
+
+	if err := sim.Evolve(c); err != nil {
+		t.Fatal(err)
+	}
+
+	sv := sim.StateVector()
+	want := []complex128{0, 0, 1, 0}
+	assertStateClose(t, sv, want)
+}
+
+func TestCCX_Toffoli(t *testing.T) {
+	c, err := builder.New("toffoli", 3).
+		X(1).
+		X(2).
+		CCX(2, 1, 0).
+		Build()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	sim, err := New(3)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer sim.Close()
+
+	if err := sim.Evolve(c); err != nil {
+		t.Fatal(err)
+	}
+
+	sv := sim.StateVector()
+	for i, v := range sv {
+		if i == 7 {
+			if cmplx.Abs(v-1) > eps {
+				t.Errorf("sv[7] = %v, want 1", v)
+			}
+		} else {
+			if cmplx.Abs(v) > eps {
+				t.Errorf("sv[%d] = %v, want 0", i, v)
+			}
+		}
+	}
+}
+
+// TestCrossValidation runs random circuits on both CPU and GPU, asserting state vectors match.
+func TestCrossValidation(t *testing.T) {
+	gates1Q := []func(int) *builder.Builder{
+		func(q int) *builder.Builder { return builder.New("", 4).H(q) },
+		func(q int) *builder.Builder { return builder.New("", 4).X(q) },
+		func(q int) *builder.Builder { return builder.New("", 4).Y(q) },
+		func(q int) *builder.Builder { return builder.New("", 4).Z(q) },
+		func(q int) *builder.Builder { return builder.New("", 4).S(q) },
+		func(q int) *builder.Builder { return builder.New("", 4).T(q) },
+	}
+
+	for i, gf := range gates1Q {
+		bld := gf(i % 4)
+		bld.CNOT(0, 1)
+		bld.H(2)
+		bld.CNOT(2, 3)
+
+		c, err := bld.Build()
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		cpuSim := statevector.New(4)
+		if err := cpuSim.Evolve(c); err != nil {
+			t.Fatal(err)
+		}
+		cpuSV := cpuSim.StateVector()
+
+		gpuSim, err := New(4)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if err := gpuSim.Evolve(c); err != nil {
+			gpuSim.Close()
+			t.Fatal(err)
+		}
+		gpuSV := gpuSim.StateVector()
+		gpuSim.Close()
+
+		assertStateClose(t, gpuSV, cpuSV)
+	}
+}
+
+// TestQFT3 verifies QFT on |000> produces uniform superposition.
+func TestQFT3(t *testing.T) {
+	c, err := builder.New("qft3", 3).
+		H(0).
+		Apply(gate.CP(math.Pi/2), 1, 0).
+		Apply(gate.CP(math.Pi/4), 2, 0).
+		H(1).
+		Apply(gate.CP(math.Pi/2), 2, 1).
+		H(2).
+		Build()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	sim, err := New(3)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer sim.Close()
+
+	if err := sim.Evolve(c); err != nil {
+		t.Fatal(err)
+	}
+
+	sv := sim.StateVector()
+	amp := 1.0 / math.Sqrt(8)
+	for i, v := range sv {
+		if math.Abs(cmplx.Abs(v)-amp) > eps {
+			t.Errorf("|sv[%d]| = %f, want %f", i, cmplx.Abs(v), amp)
+		}
+	}
+}
+
+func assertStateClose(t *testing.T, got, want []complex128) {
+	t.Helper()
+	if len(got) != len(want) {
+		t.Fatalf("state size %d, want %d", len(got), len(want))
+	}
+	for i := range got {
+		if cmplx.Abs(got[i]-want[i]) > eps {
+			t.Errorf("state[%d] = %v, want %v", i, got[i], want[i])
+		}
+	}
+}
diff --git a/sim/gpu/metal/device.go b/sim/gpu/metal/device.go
new file mode 100644
index 0000000..d0a14ee
--- /dev/null
+++ b/sim/gpu/metal/device.go
@@ -0,0 +1,33 @@
+//go:build !darwin
+
+package metal
+
+import (
+	"fmt"
+
+	"github.com/splch/goqu/circuit/ir"
+)
+
+type metalDevice struct{}
+
+var errNoMetal = fmt.Errorf("metal: not available (build with -tags metal)")
+
+func newSim(_ int) (*Sim, error) {
+	return nil, errNoMetal
+}
+
+func run(_ *Sim, _ *ir.Circuit, _ int) (map[string]int, error) {
+	return nil, errNoMetal
+}
+
+func evolve(_ *Sim, _ *ir.Circuit) error {
+	return errNoMetal
+}
+
+func stateVector(_ *Sim) []complex128 {
+	return nil
+}
+
+func closeSim(_ *Sim) error {
+	return nil
+}
diff --git a/sim/gpu/metal/device_darwin.go b/sim/gpu/metal/device_darwin.go
new file mode 100644
index 0000000..332b7cc
--- /dev/null
+++ b/sim/gpu/metal/device_darwin.go
@@ -0,0 +1,127 @@
+//go:build darwin
+
+package metal
+
+/*
+#cgo CFLAGS: -fno-objc-arc
+#cgo LDFLAGS: -framework Metal -framework Foundation
+#include "metal_bridge.h"
+#include <stdlib.h>
+*/
+import "C"
+import (
+	"fmt"
+	"unsafe"
+)
+
+type metalDevice struct {
+	sim *C.MetalSim
+}
+
+func newSim(numQubits int) (*Sim, error) {
+	var errStr *C.char
+	msim := C.MetalCreate(C.int(numQubits), &errStr)
+	if msim == nil {
+		msg := C.GoString(errStr)
+		C.free(unsafe.Pointer(errStr))
+		return nil, fmt.Errorf("metal: %s", msg)
+	}
+	return &Sim{
+		numQubits: numQubits,
+		device:    metalDevice{sim: msim},
+	}, nil
+}
+
+func closeSim(s *Sim) error {
+	if s.device.sim != nil {
+		C.MetalDestroy(s.device.sim)
+		s.device.sim = nil
+	}
+	return nil
+}
+
+// stateVector returns a copy of the GPU state as []complex128 (converting float32→float64).
+func stateVector(s *Sim) []complex128 {
+	nAmps := 1 << s.numQubits
+	out := make([]complex128, nAmps)
+	ptr := C.MetalStatePtr(s.device.sim)
+	src := unsafe.Slice((*float32)(unsafe.Pointer(ptr)), nAmps*2)
+	for i := range nAmps {
+		out[i] = complex(float64(src[2*i]), float64(src[2*i+1]))
+	}
+	return out
+}
+
+// stateVectorF32 returns a slice of float32 pairs backed by the shared Metal buffer.
+func stateVectorF32(s *Sim) []float32 {
+	nAmps := 1 << s.numQubits
+	ptr := C.MetalStatePtr(s.device.sim)
+	return unsafe.Slice((*float32)(unsafe.Pointer(ptr)), nAmps*2)
+}
+
+// --- Go wrappers for C bridge functions ---
+
+func metalResetState(s *Sim) {
+	C.MetalResetState(s.device.sim)
+}
+
+func metalBeginPass(s *Sim) error {
+	var errStr *C.char
+	if C.MetalBeginPass(s.device.sim, &errStr) != 0 {
+		msg := C.GoString(errStr)
+		C.free(unsafe.Pointer(errStr))
+		return fmt.Errorf("metal: %s", msg)
+	}
+	return nil
+}
+
+func metalEndPass(s *Sim) error {
+	var errStr *C.char
+	if C.MetalEndPass(s.device.sim, &errStr) != 0 {
+		msg := C.GoString(errStr)
+		C.free(unsafe.Pointer(errStr))
+		return fmt.Errorf("metal: %s", msg)
+	}
+	return nil
+}
+
+// metalGate1Q dispatches a 1Q gate. m is a 2x2 complex128 matrix (4 elements).
+func metalGate1Q(s *Sim, qubit int, m []complex128) error {
+	var fm [8]float32
+	for i, c := range m {
+		fm[2*i] = float32(real(c))
+		fm[2*i+1] = float32(imag(c))
+	}
+	var errStr *C.char
+	if C.MetalGate1Q(s.device.sim, C.uint32_t(qubit), (*C.float)(&fm[0]), &errStr) != 0 {
+		msg := C.GoString(errStr)
+		C.free(unsafe.Pointer(errStr))
+		return fmt.Errorf("metal: %s", msg)
+	}
+	return nil
+}
+
+// metalGate2Q dispatches a 2Q gate. m is a 4x4 complex128 matrix (16 elements).
+func metalGate2Q(s *Sim, q0, q1 int, m []complex128) error {
+	var fm [32]float32
+	for i, c := range m {
+		fm[2*i] = float32(real(c))
+		fm[2*i+1] = float32(imag(c))
+	}
+	var errStr *C.char
+	if C.MetalGate2Q(s.device.sim, C.uint32_t(q0), C.uint32_t(q1), (*C.float)(&fm[0]), &errStr) != 0 {
+		msg := C.GoString(errStr)
+		C.free(unsafe.Pointer(errStr))
+		return fmt.Errorf("metal: %s", msg)
+	}
+	return nil
+}
+
+// writeStateF32 writes amplitudes to the shared buffer (converting float64→float32).
+func writeStateF32(s *Sim, amps []complex128) {
+	buf := stateVectorF32(s)
+	for i, a := range amps {
+		buf[2*i] = float32(real(a))
+		buf[2*i+1] = float32(imag(a))
+	}
+}
diff --git a/sim/gpu/metal/go.mod b/sim/gpu/metal/go.mod
new file mode 100644
index 0000000..ad09279
--- /dev/null
+++ b/sim/gpu/metal/go.mod
@@ -0,0 +1,7 @@
+module github.com/splch/goqu/sim/gpu/metal
+
+go 1.24
+
+require github.com/splch/goqu v0.0.0
+
+replace github.com/splch/goqu => ../../../
diff --git a/sim/gpu/metal/kernels_darwin.go b/sim/gpu/metal/kernels_darwin.go
new file mode 100644
index 0000000..c95faba
--- /dev/null
+++ b/sim/gpu/metal/kernels_darwin.go
@@ -0,0 +1,228 @@
+//go:build darwin
+
+package metal
+
+import (
+	"fmt"
+	"math"
+	"math/rand/v2"
+
+	"github.com/splch/goqu/circuit/gate"
+	"github.com/splch/goqu/circuit/ir"
+)
+
+func evolve(s *Sim, c *ir.Circuit) error {
+	if c.NumQubits() != s.numQubits {
+		return fmt.Errorf("metal: circuit has %d qubits, simulator has %d",
+			c.NumQubits(), s.numQubits)
+	}
+
+	metalResetState(s)
+
+	if err := metalBeginPass(s); err != nil {
+		return err
+	}
+	passOpen := true
+
+	endPass := func() error {
+		if passOpen {
+			passOpen = false
+			return metalEndPass(s)
+		}
+		return nil
+	}
+	beginPass := func() error {
+		if !passOpen {
+			if err := metalBeginPass(s); err != nil {
+				return err
+			}
+			passOpen = true
+		}
+		return nil
+	}
+
+	for _, op := range c.Ops() {
+		if op.Gate == nil || op.Gate.Name() == "barrier" {
+			continue
+		}
+
+		if op.Gate.Name() == "reset" {
+			if err := endPass(); err != nil {
+				return err
+			}
+			resetQubitCPU(s, op.Qubits[0])
+			if err := beginPass(); err != nil {
+				return err
+			}
+			continue
+		}
+
+		if sp, ok := op.Gate.(gate.StatePrepable); ok {
+			amps := sp.Amplitudes()
+			if len(op.Qubits) == s.numQubits {
+				allInOrder := true
+				for i, q := range op.Qubits {
+					if q != i {
+						allInOrder = false
+						break
+					}
+				}
+				if allInOrder {
+					if err := endPass(); err != nil {
+						return err
+					}
+					writeStateF32(s, amps)
+					if err := beginPass(); err != nil {
+						return err
+					}
+					continue
+				}
+			}
+			applied := op.Gate.Decompose(op.Qubits)
+			for _, a := range applied {
+				m := a.Gate.Matrix()
+				if m == nil {
+					continue
+				}
+				if err := dispatchOp(s, a.Gate, a.Qubits); err != nil {
+					_ = endPass()
+					return err
+				}
+			}
+			continue
+		}
+
+		if err := dispatchOp(s, op.Gate, op.Qubits); err != nil {
+			_ = endPass()
+			return err
+		}
+	}
+
+	return endPass()
+}
+
+func dispatchOp(s *Sim, g gate.Gate, qubits []int) error {
+	switch g.Qubits() {
+	case 1:
+		return metalGate1Q(s, qubits[0], g.Matrix())
+	case 2:
+		return dispatchGate2(s, qubits[0], qubits[1], g.Matrix())
+	default:
+		applied := g.Decompose(qubits)
+		if applied != nil {
+			for _, a := range applied {
+				m := a.Gate.Matrix()
+				if m == nil {
+					continue
+				}
+				if err := dispatchOp(s, a.Gate, a.Qubits); err != nil {
+					return err
+				}
+			}
+			return nil
+		}
+		return fmt.Errorf("metal: unsupported gate %s (%d qubits)",
+			g.Name(), g.Qubits())
+	}
+}
+
+func dispatchGate2(s *Sim, q0, q1 int, m []complex128) error {
+	if q0 < q1 {
+		// CPU convention: row 1 = q1 set, row 2 = q0 set.
+		// Metal convention: a[1] = lower qubit (q0) set, a[2] = higher qubit (q1) set.
+		// When q0 < q1, rows 1 and 2 are swapped — permute the matrix.
+		m = permuteMatrix2Q(m)
+		return metalGate2Q(s, q0, q1, m)
+	}
+	// q0 > q1: swap qubit args for Metal (requires sorted). The CPU and Metal
+	// conventions happen to align when the original q0 is the higher qubit.
+	return metalGate2Q(s, q1, q0, m)
+}
+
+// permuteMatrix2Q reorders a 4x4 gate matrix when swapping qubit order.
+func permuteMatrix2Q(m []complex128) []complex128 {
+	perm := [4]int{0, 2, 1, 3}
+	out := make([]complex128, 16)
+	for r := range 4 {
+		for c := range 4 {
+			out[r*4+c] = m[perm[r]*4+perm[c]]
+		}
+	}
+	return out
+}
+
+func run(s *Sim, c *ir.Circuit, shots int) (map[string]int, error) {
+	if c.IsDynamic() {
+		return nil, fmt.Errorf("metal: dynamic circuits not supported")
+	}
+	if err := evolve(s, c); err != nil {
+		return nil, err
+	}
+
+	// Read float32 probabilities from shared buffer.
+	buf := stateVectorF32(s)
+	nAmps := 1 << s.numQubits
+	probs := make([]float64, nAmps)
+	for i := range nAmps {
+		re := float64(buf[2*i])
+		im := float64(buf[2*i+1])
+		probs[i] = re*re + im*im
+	}
+
+	counts := make(map[string]int)
+	rng := rand.New(rand.NewPCG(rand.Uint64(), rand.Uint64()))
+	for range shots {
+		idx := sampleIndex(probs, rng)
+		bs := formatBitstring(idx, s.numQubits)
+		counts[bs]++
+	}
+	return counts, nil
+}
+
+func resetQubitCPU(s *Sim, qubit int) {
+	buf := stateVectorF32(s)
+	nAmps := 1 << s.numQubits
+	halfBlock := 1 << qubit
+	block := halfBlock << 1
+	for b0 := 0; b0 < nAmps; b0 += block {
+		for offset := range halfBlock {
+			i0 := b0 + offset
+			i1 := i0 + halfBlock
+			r0, im0 := float64(buf[2*i0]), float64(buf[2*i0+1])
+			r1, im1 := float64(buf[2*i1]), float64(buf[2*i1+1])
+			norm := math.Sqrt(r0*r0 + im0*im0 + r1*r1 + im1*im1)
+			if norm > 1e-15 {
+				buf[2*i0] = float32(norm)
+			} else {
+				buf[2*i0] = 0
+			}
+			buf[2*i0+1] = 0
+			buf[2*i1] = 0
+			buf[2*i1+1] = 0
+		}
+	}
+}
+
+func sampleIndex(probs []float64, rng *rand.Rand) int {
+	r := rng.Float64()
+	cum := 0.0
+	for i, p := range probs {
+		cum += p
+		if r < cum {
+			return i
+		}
+	}
+	return len(probs) - 1
+}
+
+func formatBitstring(idx, n int) string {
+	bs := make([]byte, n)
+	for i := range n {
+		if idx&(1<<i) != 0 {
+			bs[n-1-i] = '1'
+		} else {
+			bs[n-1-i] = '0'
+		}
+	}
+	return string(bs)
+}
diff --git a/sim/gpu/metal/metal_bridge.h b/sim/gpu/metal/metal_bridge.h
new file mode 100644
index 0000000..9fa0ec4
--- /dev/null
+++ b/sim/gpu/metal/metal_bridge.h
@@ -0,0 +1,39 @@
+#ifndef METAL_BRIDGE_H
+#define METAL_BRIDGE_H
+
+#include <stdint.h>
+
+typedef struct MetalSim MetalSim;
+
+// MetalCreate creates a Metal simulator for numQubits qubits.
+// Returns NULL on error; *errOut is set to a strdup'd message (caller frees).
+MetalSim* MetalCreate(int numQubits, char** errOut);
+
+// MetalDestroy releases all Metal resources.
+void MetalDestroy(MetalSim* sim);
+
+// MetalStatePtr returns a pointer to the float32 state vector in shared memory.
+// Layout: 2*numAmps floats (pairs of real, imag) — i.e., complex64.
+float* MetalStatePtr(MetalSim* sim);
+
+// MetalNumAmps returns 2^numQubits.
+int MetalNumAmps(MetalSim* sim);
+
+// MetalResetState sets the state to |0...0>.
+void MetalResetState(MetalSim* sim);
+
+// MetalBeginPass creates a new command buffer and compute encoder.
+int MetalBeginPass(MetalSim* sim, char** errOut);
+
+// MetalGate1Q encodes a 1-qubit gate. matrix: 8 floats (4 complex, row-major).
+int MetalGate1Q(MetalSim* sim, uint32_t qubit, const float* matrix, char** errOut);
+
+// MetalGate2Q encodes a 2-qubit gate. qubit0 < qubit1 required.
+// matrix: 32 floats (16 complex, row-major).
+int MetalGate2Q(MetalSim* sim, uint32_t qubit0, uint32_t qubit1,
+                const float* matrix, char** errOut);
+
+// MetalEndPass commits the command buffer and waits for completion.
+int MetalEndPass(MetalSim* sim, char** errOut);
+
+#endif
diff --git a/sim/gpu/metal/metal_bridge.m b/sim/gpu/metal/metal_bridge.m
new file mode 100644
index 0000000..c85d3d7
--- /dev/null
+++ b/sim/gpu/metal/metal_bridge.m
@@ -0,0 +1,359 @@
+#import <Metal/Metal.h>
+#import <Foundation/Foundation.h>
+#include "metal_bridge.h"
+#include <string.h>
+#include <stdlib.h>
+
+#if __has_feature(objc_arc)
+#error "This file must be compiled without ARC (-fno-objc-arc)"
+#endif
+
+// ---------------------------------------------------------------------------
+// Shader source (embedded) — 1Q and 2Q gate kernels using float2 (complex64).
+// ---------------------------------------------------------------------------
+
+static const char *kShaderSource =
+"#include <metal_stdlib>\n"
+"using namespace metal;\n"
+"\n"
+"// ---- 1-qubit gate ----\n"
+"struct Gate1QParams {\n"
+"    uint qubit;\n"
+"    uint nAmps;\n"
+"    float m[8];\n"
+"};\n"
+"\n"
+"inline float2 cmul(float2 a, float2 b) {\n"
+"    return float2(a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);\n"
+"}\n"
+"inline float2 cadd(float2 a, float2 b) {\n"
+"    return float2(a.x + b.x, a.y + b.y);\n"
+"}\n"
+"\n"
+"kernel void gate1q(\n"
+"    device float2 *sv [[buffer(0)]],\n"
+"    constant Gate1QParams &params [[buffer(1)]],\n"
+"    uint gid [[thread_position_in_grid]]\n"
+") {\n"
+"    uint halfBlock = 1u << params.qubit;\n"
+"    uint block = halfBlock << 1u;\n"
+"    uint blockIdx = gid / halfBlock;\n"
+"    uint offset   = gid % halfBlock;\n"
+"    uint b0 = blockIdx * block;\n"
+"    uint i0 = b0 + offset;\n"
+"    uint i1 = i0 + halfBlock;\n"
+"    if (i1 >= params.nAmps) return;\n"
+"    float2 a0 = sv[i0];\n"
+"    float2 a1 = sv[i1];\n"
+"    float2 m00 = float2(params.m[0], params.m[1]);\n"
+"    float2 m01 = float2(params.m[2], params.m[3]);\n"
+"    float2 m10 = float2(params.m[4], params.m[5]);\n"
+"    float2 m11 = float2(params.m[6], params.m[7]);\n"
+"    sv[i0] = cadd(cmul(m00, a0), cmul(m01, a1));\n"
+"    sv[i1] = cadd(cmul(m10, a0), cmul(m11, a1));\n"
+"}\n"
+"\n"
+"// ---- 2-qubit gate ----\n"
+"struct Gate2QParams {\n"
+"    uint qubit0;\n"
+"    uint qubit1;\n"
+"    uint nAmps;\n"
+"    float m[32];\n"
+"};\n"
+"\n"
+"kernel void gate2q(\n"
+"    device float2 *sv [[buffer(0)]],\n"
+"    constant Gate2QParams &params [[buffer(1)]],\n"
+"    uint gid [[thread_position_in_grid]]\n"
+") {\n"
+"    uint q0 = params.qubit0;\n"
+"    uint q1 = params.qubit1;\n"
+"    uint idx = gid;\n"
+"    uint lo0 = idx & ((1u << q0) - 1u);\n"
+"    uint hi0 = idx >> q0;\n"
+"    idx = (hi0 << (q0 + 1u)) | lo0;\n"
+"    uint lo1 = idx & ((1u << q1) - 1u);\n"
+"    uint hi1 = idx >> q1;\n"
+"    idx = (hi1 << (q1 + 1u)) | lo1;\n"
+"    uint i00 = idx;\n"
+"    uint i01 = idx | (1u << q0);\n"
+"    uint i10 = idx | (1u << q1);\n"
+"    uint i11 = idx | (1u << q0) | (1u << q1);\n"
+"    if (i11 >= params.nAmps) return;\n"
+"    float2 a[4] = { sv[i00], sv[i01], sv[i10], sv[i11] };\n"
+"    float2 r[4];\n"
+"    for (int row = 0; row < 4; row++) {\n"
+"        r[row] = float2(0.0, 0.0);\n"
+"        for (int col = 0; col < 4; col++) {\n"
+"            float2 mv = float2(params.m[(row*4+col)*2], params.m[(row*4+col)*2+1]);\n"
+"            r[row] = cadd(r[row], cmul(mv, a[col]));\n"
+"        }\n"
+"    }\n"
+"    sv[i00] = r[0];\n"
+"    sv[i01] = r[1];\n"
+"    sv[i10] = r[2];\n"
+"    sv[i11] = r[3];\n"
+"}\n";
+
+// ---------------------------------------------------------------------------
+// C-side parameter structs (must match shader layout exactly).
+// ---------------------------------------------------------------------------
+
+typedef struct {
+    uint32_t qubit;
+    uint32_t nAmps;
+    float m[8];
+} Gate1QParams;
+
+typedef struct {
+    uint32_t qubit0;
+    uint32_t qubit1;
+    uint32_t nAmps;
+    float m[32];
+} Gate2QParams;
+
+// ---------------------------------------------------------------------------
+// MetalSim — opaque handle holding all Metal state.
+// ---------------------------------------------------------------------------
+
+struct MetalSim {
+    int numQubits;
+    int numAmps;
+    id<MTLDevice>                device;
+    id<MTLCommandQueue>          queue;
+    id<MTLComputePipelineState>  pipe1q;
+    id<MTLComputePipelineState>  pipe2q;
+    id<MTLBuffer>                svBuffer;
+    // Transient per-pass state (non-nil between BeginPass/EndPass).
+    id<MTLCommandBuffer>         cmdBuf;
+    id<MTLComputeCommandEncoder> encoder;
+    NSUInteger tgSize1q;
+    NSUInteger tgSize2q;
+};
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+static char* errdup(NSError *err) {
+    if (err) {
+        return strdup([[err localizedDescription] UTF8String]);
+    }
+    return strdup("unknown Metal error");
+}
+
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+
+MetalSim* MetalCreate(int numQubits, char** errOut) {
+    @autoreleasepool {
+        if (numQubits < 1) {
+            *errOut = strdup("numQubits must be >= 1");
+            return NULL;
+        }
+
+        id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+        if (!device) {
+            *errOut = strdup("no Metal device available");
+            return NULL;
+        }
+
+        // Compile shaders.
+        NSError *error = nil;
+        NSString *src = [NSString stringWithUTF8String:kShaderSource];
+        id<MTLLibrary> library = [device newLibraryWithSource:src options:nil error:&error];
+        if (!library) {
+            *errOut = errdup(error);
+            [device release];
+            return NULL;
+        }
+
+        id<MTLFunction> func1q = [library newFunctionWithName:@"gate1q"];
+        id<MTLFunction> func2q = [library newFunctionWithName:@"gate2q"];
+        if (!func1q || !func2q) {
+            *errOut = strdup("shader function not found");
+            if (func1q) [func1q release];
+            if (func2q) [func2q release];
+            [library release];
+            [device release];
+            return NULL;
+        }
+
+        id<MTLComputePipelineState> pipe1q =
+            [device newComputePipelineStateWithFunction:func1q error:&error];
+        if (!pipe1q) {
+            *errOut = errdup(error);
+            [func1q release]; [func2q release];
+            [library release]; [device release];
+            return NULL;
+        }
+        id<MTLComputePipelineState> pipe2q =
+            [device newComputePipelineStateWithFunction:func2q error:&error];
+        if (!pipe2q) {
+            *errOut = errdup(error);
+            [pipe1q release]; [func1q release]; [func2q release];
+            [library release]; [device release];
+            return NULL;
+        }
+
+        [func1q release];
+        [func2q release];
+        [library release];
+
+        // Allocate shared-memory state-vector buffer (float2 = 8 bytes per amplitude).
+        int numAmps = 1 << numQubits;
+        NSUInteger bufSize = (NSUInteger)numAmps * 2 * sizeof(float);
+        id<MTLBuffer> svBuffer = [device newBufferWithLength:bufSize
+                                                     options:MTLResourceStorageModeShared];
+        if (!svBuffer) {
+            *errOut = strdup("failed to allocate state vector buffer");
+            [pipe2q release]; [pipe1q release]; [device release];
+            return NULL;
+        }
+
+        // Initialize to |0...0>: first amplitude = (1, 0).
+        float *ptr = (float*)[svBuffer contents];
+        memset(ptr, 0, bufSize);
+        ptr[0] = 1.0f;
+
+        id<MTLCommandQueue> queue = [device newCommandQueue];
+
+        MetalSim *sim = (MetalSim*)calloc(1, sizeof(MetalSim));
+        sim->numQubits = numQubits;
+        sim->numAmps   = numAmps;
+        sim->device    = device;
+        sim->queue     = queue;
+        sim->pipe1q    = pipe1q;
+        sim->pipe2q    = pipe2q;
+        sim->svBuffer  = svBuffer;
+        sim->cmdBuf    = nil;
+        sim->encoder   = nil;
+        sim->tgSize1q  = [pipe1q maxTotalThreadsPerThreadgroup];
+        sim->tgSize2q  = [pipe2q maxTotalThreadsPerThreadgroup];
+        if (sim->tgSize1q > 256) sim->tgSize1q = 256;
+        if (sim->tgSize2q > 256) sim->tgSize2q = 256;
+        return sim;
+    }
+}
+
+void MetalDestroy(MetalSim* sim) {
+    if (!sim) return;
+    @autoreleasepool {
+        if (sim->encoder) { [sim->encoder endEncoding]; [sim->encoder release]; }
+        if (sim->cmdBuf)  { [sim->cmdBuf release]; }
+        [sim->svBuffer release];
+        [sim->pipe2q release];
+        [sim->pipe1q release];
+        [sim->queue release];
+        [sim->device release];
+    }
+    free(sim);
+}
+
+float* MetalStatePtr(MetalSim* sim) {
+    return (float*)[sim->svBuffer contents];
+}
+
+int MetalNumAmps(MetalSim* sim) {
+    return sim->numAmps;
+}
+
+void MetalResetState(MetalSim* sim) {
+    float *ptr = (float*)[sim->svBuffer contents];
+    memset(ptr, 0, (size_t)sim->numAmps * 2 * sizeof(float));
+    ptr[0] = 1.0f;
+}
+
+int MetalBeginPass(MetalSim* sim, char** errOut) {
+    @autoreleasepool {
+        id<MTLCommandBuffer> buf = [sim->queue commandBuffer];
+        if (!buf) {
+            *errOut = strdup("failed to create command buffer");
+            return -1;
+        }
+        sim->cmdBuf = [buf retain];
+
+        id<MTLComputeCommandEncoder> enc = [sim->cmdBuf computeCommandEncoder];
+        if (!enc) {
+            *errOut = strdup("failed to create compute encoder");
+            [sim->cmdBuf release];
+            sim->cmdBuf = nil;
+            return -1;
+        }
+        sim->encoder = [enc retain];
+    }
+    return 0;
+}
+
+int MetalGate1Q(MetalSim* sim, uint32_t qubit, const float* matrix,
+                char** errOut) {
+    (void)errOut;
+    Gate1QParams params;
+    params.qubit = qubit;
+    params.nAmps = (uint32_t)sim->numAmps;
+    memcpy(params.m, matrix, 8 * sizeof(float));
+
+    [sim->encoder setComputePipelineState:sim->pipe1q];
+    [sim->encoder setBuffer:sim->svBuffer offset:0 atIndex:0];
+    [sim->encoder setBytes:&params length:sizeof(params) atIndex:1];
+
+    NSUInteger numThreads = (NSUInteger)sim->numAmps / 2;
+    NSUInteger tgSize = sim->tgSize1q;
+    if (tgSize > numThreads) tgSize = numThreads;
+    if (tgSize < 1) tgSize = 1;
+    NSUInteger numGroups = (numThreads + tgSize - 1) / tgSize;
+
+    [sim->encoder dispatchThreadgroups:MTLSizeMake(numGroups, 1, 1)
+                 threadsPerThreadgroup:MTLSizeMake(tgSize, 1, 1)];
+    [sim->encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
+    return 0;
+}
+
+int MetalGate2Q(MetalSim* sim, uint32_t qubit0, uint32_t qubit1,
+                const float* matrix, char** errOut) {
+    (void)errOut;
+    Gate2QParams params;
+    params.qubit0 = qubit0;
+    params.qubit1 = qubit1;
+    params.nAmps  = (uint32_t)sim->numAmps;
+    memcpy(params.m, matrix, 32 * sizeof(float));
+
+    [sim->encoder setComputePipelineState:sim->pipe2q];
+    [sim->encoder setBuffer:sim->svBuffer offset:0 atIndex:0];
+    [sim->encoder setBytes:&params length:sizeof(params) atIndex:1];
+
+    NSUInteger numThreads = (NSUInteger)sim->numAmps / 4;
+    NSUInteger tgSize = sim->tgSize2q;
+    if (tgSize > numThreads) tgSize = numThreads;
+    if (tgSize < 1) tgSize = 1;
+    NSUInteger numGroups = (numThreads + tgSize - 1) / tgSize;
+
+    [sim->encoder dispatchThreadgroups:MTLSizeMake(numGroups, 1, 1)
+                 threadsPerThreadgroup:MTLSizeMake(tgSize, 1, 1)];
+    [sim->encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
+    return 0;
+}
+
+int MetalEndPass(MetalSim* sim, char** errOut) {
+    @autoreleasepool {
+        [sim->encoder endEncoding];
+        [sim->encoder release];
+        sim->encoder = nil;
+
+        [sim->cmdBuf commit];
+        [sim->cmdBuf waitUntilCompleted];
+
+        if ([sim->cmdBuf status] == MTLCommandBufferStatusError) {
+            NSError *error = [sim->cmdBuf error];
+            *errOut = errdup(error);
+            [sim->cmdBuf release];
+            sim->cmdBuf = nil;
+            return -1;
+        }
+
+        [sim->cmdBuf release];
+        sim->cmdBuf = nil;
+    }
+    return 0;
+}
diff --git a/sim/gpu/metal/sim.go b/sim/gpu/metal/sim.go
new file mode 100644
index 0000000..0577f77
--- /dev/null
+++ b/sim/gpu/metal/sim.go
@@ -0,0 +1,55 @@
+// Package metal provides a GPU-accelerated statevector simulator using Apple Metal compute shaders.
+//
+// The simulator satisfies [sim.Simulator] and can be used as a drop-in replacement
+// for the CPU statevector simulator via [backend/local.WithSimulator]:
+//
+//	import gpumetal "github.com/splch/goqu/sim/gpu/metal"
+//	local.New(local.WithSimulator(gpumetal.NewSimulator))
+//
+// Build requires the metal build tag and macOS/Apple Silicon.
+// Without the tag, stub implementations return an error.
+package metal
+
+import (
+	"github.com/splch/goqu/circuit/ir"
+	"github.com/splch/goqu/sim"
+)
+
+var _ sim.Simulator = (*Sim)(nil)
+
+// Sim is a GPU-accelerated statevector simulator backed by Apple Metal compute shaders.
+// On Apple Silicon, unified memory eliminates explicit CPU-GPU transfers.
+type Sim struct {
+	numQubits int
+	device    metalDevice
+}
+
+// NewSimulator is a [sim.SimFactory] that creates a Metal GPU simulator.
+func NewSimulator(numQubits int) (sim.Simulator, error) {
+	return New(numQubits)
+}
+
+// New creates a Metal GPU simulator initialized to |0...0>.
+func New(numQubits int) (*Sim, error) {
+	return newSim(numQubits)
+}
+
+// Run executes the circuit and returns measurement counts.
+func (s *Sim) Run(c *ir.Circuit, shots int) (map[string]int, error) {
+	return run(s, c, shots)
+}
+
+// Evolve applies all gate operations without measuring.
+func (s *Sim) Evolve(c *ir.Circuit) error {
+	return evolve(s, c)
+}
+
+// StateVector returns a copy of the current state vector.
+func (s *Sim) StateVector() []complex128 {
+	return stateVector(s)
+}
+
+// Close releases Metal device resources.
+func (s *Sim) Close() error {
+	return closeSim(s)
+}
diff --git a/sim/gpu/metal/sim_test.go b/sim/gpu/metal/sim_test.go
new file mode 100644
index 0000000..7484a60
--- /dev/null
+++ b/sim/gpu/metal/sim_test.go
@@ -0,0 +1,172 @@
+//go:build darwin
+
+package metal
+
+import (
+	"math"
+	"math/cmplx"
+	"testing"
+
+	"github.com/splch/goqu/circuit/builder"
+	"github.com/splch/goqu/sim/statevector"
+)
+
+// Metal uses float32 precision; allow ~1e-5 tolerance.
+const eps = 1e-5
+
+func TestBellState(t *testing.T) {
+	c, err := builder.New("bell", 2).
+		H(0).
+		CNOT(0, 1).
+		Build()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	sim, err := New(2)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer sim.Close()
+
+	if err := sim.Evolve(c); err != nil {
+		t.Fatal(err)
+	}
+
+	sv := sim.StateVector()
+	s2 := 1.0 / math.Sqrt2
+	want := []complex128{complex(s2, 0), 0, 0, complex(s2, 0)}
+	assertStateClose(t, sv, want)
+}
+
+func TestCrossValidation(t *testing.T) {
+	bld := builder.New("cross", 4)
+	bld.H(0)
+	bld.CNOT(0, 1)
+	bld.H(2)
+	bld.CNOT(2, 3)
+	c, err := bld.Build()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	cpuSim := statevector.New(4)
+	if err := cpuSim.Evolve(c); err != nil {
+		t.Fatal(err)
+	}
+	cpuSV := cpuSim.StateVector()
+
+	gpuSim, err := New(4)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer gpuSim.Close()
+
+	if err := gpuSim.Evolve(c); err != nil {
+		t.Fatal(err)
+	}
+	gpuSV := gpuSim.StateVector()
+
+	assertStateClose(t, gpuSV, cpuSV)
+}
+
+func TestReversedQubits(t *testing.T) {
+	// CNOT with control > target to test qubit-swap path.
+	c, err := builder.New("rev", 3).
+		H(2).
+		CNOT(2, 0).
+		Build()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	cpuSim := statevector.New(3)
+	if err := cpuSim.Evolve(c); err != nil {
+		t.Fatal(err)
+	}
+
+	gpuSim, err := New(3)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer gpuSim.Close()
+
+	if err := gpuSim.Evolve(c); err != nil {
+		t.Fatal(err)
+	}
+
+	assertStateClose(t, gpuSim.StateVector(), cpuSim.StateVector())
+}
+
+func TestRun(t *testing.T) {
+	c, err := builder.New("bell", 2).
+		H(0).
+		CNOT(0, 1).
+		Build()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	sim, err := New(2)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer sim.Close()
+
+	counts, err := sim.Run(c, 1000)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Bell state should only produce "00" and "11".
+	for bs := range counts {
+		if bs != "00" && bs != "11" {
+			t.Errorf("unexpected bitstring %q", bs)
+		}
+	}
+	if counts["00"]+counts["11"] != 1000 {
+		t.Errorf("total shots = %d, want 1000", counts["00"]+counts["11"])
+	}
+}
+
+func TestGHZ(t *testing.T) {
+	// 5-qubit GHZ state: (|00000> + |11111>) / sqrt(2)
+	bld := builder.New("ghz", 5)
+	bld.H(0)
+	for i := range 4 {
+		bld.CNOT(i, i+1)
+	}
+	c, err := bld.Build()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	cpuSim := statevector.New(5)
+	if err := cpuSim.Evolve(c); err != nil {
+		t.Fatal(err)
+	}
+
+	gpuSim, err := New(5)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer gpuSim.Close()
+
+	if err := gpuSim.Evolve(c); err != nil {
+		t.Fatal(err)
+	}
+
+	assertStateClose(t, gpuSim.StateVector(), cpuSim.StateVector())
+}
+
+func assertStateClose(t *testing.T, got, want []complex128) {
+	t.Helper()
+	if len(got) != len(want) {
+		t.Fatalf("state size %d, want %d", len(got), len(want))
+	}
+	for i := range got {
+		if cmplx.Abs(got[i]-want[i]) > eps {
+			t.Errorf("state[%d] = %v, want %v", i, got[i], want[i])
+		}
+	}
+}
diff --git a/sim/sim.go b/sim/sim.go
new file mode 100644
index 0000000..72367a5
--- /dev/null
+++ b/sim/sim.go
@@ -0,0 +1,22 @@
+// Package sim defines the Simulator interface satisfied by all backends
+// (CPU statevector, density matrix, GPU, etc.).
+package sim
+
+import "github.com/splch/goqu/circuit/ir"
+
+// Simulator is the common interface for quantum circuit simulators.
+// Both CPU and GPU implementations satisfy this interface.
+type Simulator interface {
+	// Run executes the circuit for the given number of shots and returns measurement counts.
+	Run(c *ir.Circuit, shots int) (map[string]int, error)
+
+	// Evolve applies all gate operations without measuring, leaving the state accessible.
+	Evolve(c *ir.Circuit) error
+
+	// StateVector returns a copy of the current state vector.
+	StateVector() []complex128
+
+	// Close releases any resources held by the simulator.
+	// For CPU simulators this is a no-op; GPU simulators free device memory.
+	Close() error
+}
diff --git a/sim/statevector/sim.go b/sim/statevector/sim.go
index 58ef253..4f017ce 100644
--- a/sim/statevector/sim.go
+++ b/sim/statevector/sim.go
@@ -11,9 +11,12 @@ import (
 
 	"github.com/splch/goqu/circuit/gate"
 	"github.com/splch/goqu/circuit/ir"
+	"github.com/splch/goqu/sim"
 	"github.com/splch/goqu/sim/pauli"
 )
 
+var _ sim.Simulator = (*Sim)(nil)
+
 // Sim simulates a circuit via full statevector evolution.
 type Sim struct {
 	numQubits int
@@ -61,6 +64,9 @@ func (s *Sim) StateVector() []complex128 {
 	return out
 }
 
+// Close is a no-op for the CPU statevector simulator. It satisfies sim.Simulator.
+func (s *Sim) Close() error { return nil }
+
 // Evolve applies all gate operations without measuring, leaving the statevector accessible.
 func (s *Sim) Evolve(c *ir.Circuit) error {
 	if c.NumQubits() != s.numQubits {