diff --git a/Makefile b/Makefile index 504a9a8..ad1a428 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: test test-race test-all lint vet fuzz bench coverage clean hooks +.PHONY: test test-race test-all lint vet fuzz bench bench-gpu-cuda test-gpu-cuda coverage clean hooks test: go test -count=1 -timeout=5m ./... @@ -10,6 +10,8 @@ test-all: test-race cd backend/braket && go test -race -count=1 -timeout=5m ./... cd observe/otelbridge && go build ./... cd observe/prombridge && go build ./... + cd sim/gpu/cuda && go build ./... + cd sim/gpu/metal && go build ./... lint: golangci-lint run ./... @@ -20,6 +22,8 @@ vet: cd backend/braket && go vet ./... cd observe/otelbridge && go vet ./... cd observe/prombridge && go vet ./... + cd sim/gpu/cuda && go vet ./... + cd sim/gpu/metal && go vet ./... fuzz: go test ./qasm/parser -run=^$$ -fuzz=FuzzParse -fuzztime=30s @@ -41,6 +45,12 @@ bench: go test ./sim/statevector/ -bench=. -count=5 -benchmem -run=^$$ -timeout=10m go test ./sim/densitymatrix/ -bench=. -count=5 -benchmem -run=^$$ -timeout=10m +test-gpu-cuda: + cd sim/gpu/cuda && go test -tags cuda -count=1 -timeout=5m ./... + +bench-gpu-cuda: + cd sim/gpu/cuda && go test -tags cuda -bench=. -count=5 -benchmem -run=^$$ -timeout=10m + coverage: go test -coverprofile=coverage.out ./... go tool cover -html=coverage.out -o coverage.html diff --git a/backend/local/local.go b/backend/local/local.go index 1567aa1..1198d7c 100644 --- a/backend/local/local.go +++ b/backend/local/local.go @@ -12,6 +12,7 @@ import ( "github.com/splch/goqu/backend" "github.com/splch/goqu/observe" + "github.com/splch/goqu/sim" "github.com/splch/goqu/sim/pulsesim" "github.com/splch/goqu/sim/statevector" "github.com/splch/goqu/transpile/target" @@ -19,11 +20,16 @@ import ( var _ backend.Backend = (*Backend)(nil) +// SimFactory creates a Simulator for the given number of qubits. +// The default factory creates a CPU statevector simulator. +type SimFactory func(numQubits int) (sim.Simulator, error) + // Backend runs circuits on the local statevector simulator. type Backend struct { - maxQubits int - results sync.Map // jobID → *backend.Result - logger *slog.Logger + maxQubits int + simFactory SimFactory + results sync.Map // jobID → *backend.Result + logger *slog.Logger } // Option configures a local Backend. @@ -39,9 +45,21 @@ func WithLogger(l *slog.Logger) Option { return func(b *Backend) { b.logger = l } } +// WithSimulator sets a custom simulator factory. This allows plugging in +// alternative simulators (e.g., GPU-accelerated) while keeping the same backend API. +func WithSimulator(f SimFactory) Option { + return func(b *Backend) { b.simFactory = f } +} + // New creates a local simulator backend. func New(opts ...Option) *Backend { - b := &Backend{maxQubits: 28, logger: slog.Default()} + b := &Backend{ + maxQubits: 28, + simFactory: func(numQubits int) (sim.Simulator, error) { + return statevector.New(numQubits), nil + }, + logger: slog.Default(), + } for _, opt := range opts { opt(b) } @@ -92,8 +110,15 @@ func (b *Backend) Submit(ctx context.Context, req *backend.SubmitRequest) (*back ) start := time.Now() - sim := statevector.New(nq) - counts, err := sim.Run(req.Circuit, req.Shots) + s, sErr := b.simFactory(nq) + if sErr != nil { + if simDone != nil { + simDone(sErr) + } + return nil, fmt.Errorf("local: %w", sErr) + } + defer func() { _ = s.Close() }() + counts, err := s.Run(req.Circuit, req.Shots) elapsed := time.Since(start) if simDone != nil { diff --git a/sim/densitymatrix/sim.go b/sim/densitymatrix/sim.go index fcca2eb..b3e11bd 100644 --- a/sim/densitymatrix/sim.go +++ b/sim/densitymatrix/sim.go @@ -12,6 +12,9 @@ import ( "github.com/splch/goqu/sim/pauli" ) +// Close is a no-op for the CPU density matrix simulator. +func (s *Sim) Close() error { return nil } + // parallelThreshold is the minimum number of qubits before enabling parallel kernels. // At 9 qubits dim=512 and the density matrix has 262K elements; the heavier // per-element work (row + column passes) justifies a lower threshold than statevector. diff --git a/sim/gpu/cuda/benchmark_test.go b/sim/gpu/cuda/benchmark_test.go new file mode 100644 index 0000000..cd38997 --- /dev/null +++ b/sim/gpu/cuda/benchmark_test.go @@ -0,0 +1,64 @@ +//go:build cuda + +package cuda + +import ( + "testing" + + "github.com/splch/goqu/circuit/builder" + "github.com/splch/goqu/sim/statevector" +) + +// BenchmarkGPU_GHZ benchmarks GPU GHZ circuit creation across qubit counts. +func BenchmarkGPU_GHZ(b *testing.B) { + for _, nq := range []int{12, 16, 20, 24, 28} { + bld := builder.New("ghz", nq) + bld.H(0) + for i := range nq - 1 { + bld.CNOT(i, i+1) + } + c, err := bld.Build() + if err != nil { + b.Fatal(err) + } + + b.Run(qName("GPU", nq), func(b *testing.B) { + sim, err := New(nq) + if err != nil { + b.Skip("CUDA not available:", err) + } + defer sim.Close() + b.ResetTimer() + for range b.N { + sim.Evolve(c) + } + }) + } +} + +// BenchmarkCPU_GHZ benchmarks CPU GHZ for comparison. +func BenchmarkCPU_GHZ(b *testing.B) { + for _, nq := range []int{12, 16, 20, 24, 28} { + bld := builder.New("ghz", nq) + bld.H(0) + for i := range nq - 1 { + bld.CNOT(i, i+1) + } + c, err := bld.Build() + if err != nil { + b.Fatal(err) + } + + b.Run(qName("CPU", nq), func(b *testing.B) { + sim := statevector.New(nq) + b.ResetTimer() + for range b.N { + sim.Evolve(c) + } + }) + } +} + +func qName(prefix string, nq int) string { + return prefix + "_" + string(rune('0'+nq/10)) + string(rune('0'+nq%10)) + "Q" +} diff --git a/sim/gpu/cuda/expect.go b/sim/gpu/cuda/expect.go new file mode 100644 index 0000000..53a79f9 --- /dev/null +++ b/sim/gpu/cuda/expect.go @@ -0,0 +1,79 @@ +//go:build cuda + +package cuda + +/* +#include + +// goPauliExpect computes ⟨ψ|P|ψ⟩ for a Pauli string on the GPU. +static custatevecStatus_t goPauliExpect( + custatevecHandle_t handle, + const void *sv, cudaDataType_t svDataType, int nQubits, + double *expectationValue, + const custatevecPauli_t *pauliOps, const int32_t *basisBits, int nBasisBits +) { + return custatevecComputeExpectation( + handle, sv, svDataType, (uint32_t)nQubits, + expectationValue, CUDA_R_64F, + pauliOps, basisBits, (uint32_t)nBasisBits + ); +} +*/ +import "C" +import ( + "fmt" + + "github.com/splch/goqu/sim/pauli" +) + +// pauliToCUSV maps goqu Pauli values to cuStateVec Pauli enum values. +func pauliToCUSV(p pauli.Pauli) C.custatevecPauli_t { + switch p { + case pauli.I: + return C.CUSTATEVEC_PAULI_I + case pauli.X: + return C.CUSTATEVEC_PAULI_X + case pauli.Y: + return C.CUSTATEVEC_PAULI_Y + case pauli.Z: + return C.CUSTATEVEC_PAULI_Z + default: + return C.CUSTATEVEC_PAULI_I + } +} + +// ExpectPauliString computes Re(⟨ψ|P|ψ⟩) for a Pauli string P on the GPU. +func (s *Sim) ExpectPauliString(ps pauli.PauliString) (float64, error) { + if ps.NumQubits() != s.numQubits { + return 0, fmt.Errorf("cuda: PauliString has %d qubits, simulator has %d", + ps.NumQubits(), s.numQubits) + } + + ops := ps.Ops() + nBasis := len(ops) + if nBasis == 0 { + return real(ps.Coeff()), nil + } + + pauliOps := make([]C.custatevecPauli_t, nBasis) + basisBits := make([]C.int32_t, nBasis) + i := 0 + for qubit, p := range ops { + pauliOps[i] = pauliToCUSV(p) + basisBits[i] = C.int32_t(qubit) + i++ + } + + var expect C.double + st := C.goPauliExpect( + s.handle.h, + s.devicePtr.ptr, C.CUDA_C_64F, C.int(s.numQubits), + &expect, + &pauliOps[0], &basisBits[0], C.int(nBasis), + ) + if st != C.CUSTATEVEC_STATUS_SUCCESS { + return 0, fmt.Errorf("custatevecComputeExpectation failed: status %d", int(st)) + } + + return real(ps.Coeff()) * float64(expect), nil +} diff --git a/sim/gpu/cuda/go.mod b/sim/gpu/cuda/go.mod new file mode 100644 index 0000000..36aae87 --- /dev/null +++ b/sim/gpu/cuda/go.mod @@ -0,0 +1,7 @@ +module github.com/splch/goqu/sim/gpu/cuda + +go 1.24 + +require github.com/splch/goqu v0.0.0 + +replace github.com/splch/goqu => ../../../ diff --git a/sim/gpu/cuda/handle.go b/sim/gpu/cuda/handle.go new file mode 100644 index 0000000..81d71c2 --- /dev/null +++ b/sim/gpu/cuda/handle.go @@ -0,0 +1,51 @@ +//go:build cuda + +package cuda + +/* +#cgo LDFLAGS: -lcustatevec -lcudart +#include +#include + +// createHandle wraps custatevecCreate. +static custatevecStatus_t goCreateHandle(custatevecHandle_t *handle) { + return custatevecCreate(handle); +} + +// destroyHandle wraps custatevecDestroy. +static custatevecStatus_t goDestroyHandle(custatevecHandle_t handle) { + return custatevecDestroy(handle); +} +*/ +import "C" +import ( + "fmt" + "unsafe" +) + +type cusvHandle struct { + h C.custatevecHandle_t + stream C.cudaStream_t +} + +type deviceAlloc struct { + ptr unsafe.Pointer + size int // number of complex128 elements +} + +func createHandle() (cusvHandle, error) { + var h cusvHandle + if st := C.goCreateHandle(&h.h); st != C.CUSTATEVEC_STATUS_SUCCESS { + return h, fmt.Errorf("custatevecCreate failed: status %d", int(st)) + } + if st := C.cudaStreamCreate(&h.stream); st != C.cudaSuccess { + C.goDestroyHandle(h.h) + return h, fmt.Errorf("cudaStreamCreate failed: status %d", int(st)) + } + return h, nil +} + +func destroyHandle(h cusvHandle) { + C.cudaStreamDestroy(h.stream) + C.goDestroyHandle(h.h) +} diff --git a/sim/gpu/cuda/kernels.go b/sim/gpu/cuda/kernels.go new file mode 100644 index 0000000..384861f --- /dev/null +++ b/sim/gpu/cuda/kernels.go @@ -0,0 +1,253 @@ +//go:build cuda + +package cuda + +/* +#include + +// applyMatrix wraps custatevecApplyMatrix for gate application. +static custatevecStatus_t goApplyMatrix( + custatevecHandle_t handle, + void *sv, cudaDataType_t svDataType, int nQubits, + const void *matrix, cudaDataType_t matrixDataType, + custatevecMatrixLayout_t layout, + int adjoint, + const int32_t *targets, int nTargets, + const int32_t *controls, const int32_t *controlBitValues, int nControls, + custatevecComputeType_t computeType, + void *workspace, size_t workspaceSize +) { + return custatevecApplyMatrix( + handle, sv, svDataType, (uint32_t)nQubits, + matrix, matrixDataType, layout, (int32_t)adjoint, + targets, (uint32_t)nTargets, + controls, controlBitValues, (uint32_t)nControls, + computeType, workspace, workspaceSize + ); +} + +// getApplyMatrixWorkspaceSize wraps custatevecApplyMatrixGetWorkspaceSize. +static custatevecStatus_t goGetWorkspaceSize( + custatevecHandle_t handle, + cudaDataType_t svDataType, int nQubits, + const void *matrix, cudaDataType_t matrixDataType, + custatevecMatrixLayout_t layout, + int adjoint, + int nTargets, int nControls, + custatevecComputeType_t computeType, + size_t *workspaceSize +) { + return custatevecApplyMatrixGetWorkspaceSize( + handle, svDataType, (uint32_t)nQubits, + matrix, matrixDataType, layout, (int32_t)adjoint, + (uint32_t)nTargets, (uint32_t)nControls, + computeType, workspaceSize + ); +} +*/ +import "C" +import ( + "fmt" + "math" + "unsafe" + + "github.com/splch/goqu/circuit/gate" + "github.com/splch/goqu/circuit/ir" +) + +// applyGate sends a gate matrix to the GPU and applies it via custatevecApplyMatrix. +func applyGate(s *Sim, targets []int, controls []int, m []complex128) error { + nTargets := len(targets) + nControls := len(controls) + + // Convert to int32 slices for C. + tgts := make([]C.int32_t, nTargets) + for i, t := range targets { + tgts[i] = C.int32_t(t) + } + + var ctrlPtr *C.int32_t + var ctrlBitsPtr *C.int32_t + var ctrlBits []C.int32_t + if nControls > 0 { + ctrls := make([]C.int32_t, nControls) + ctrlBits = make([]C.int32_t, nControls) + for i, c := range controls { + ctrls[i] = C.int32_t(c) + ctrlBits[i] = 1 // control on |1> + } + ctrlPtr = &ctrls[0] + ctrlBitsPtr = &ctrlBits[0] + } + + // Query workspace size. + var wsSize C.size_t + st := C.goGetWorkspaceSize( + s.handle.h, + C.CUDA_C_64F, C.int(s.numQubits), + unsafe.Pointer(&m[0]), C.CUDA_C_64F, + C.CUSTATEVEC_MATRIX_LAYOUT_ROW, + 0, + C.int(nTargets), C.int(nControls), + C.CUSTATEVEC_COMPUTE_64F, + &wsSize, + ) + if st != C.CUSTATEVEC_STATUS_SUCCESS { + return fmt.Errorf("custatevecApplyMatrixGetWorkspaceSize failed: status %d", int(st)) + } + + // Allocate workspace if needed. + var wsPtr unsafe.Pointer + if wsSize > 0 { + if cst := C.cudaMalloc(&wsPtr, wsSize); cst != C.cudaSuccess { + return fmt.Errorf("cudaMalloc workspace failed: status %d", int(cst)) + } + defer C.cudaFree(wsPtr) + } + + // Apply the gate. + st = C.goApplyMatrix( + s.handle.h, + s.devicePtr.ptr, C.CUDA_C_64F, C.int(s.numQubits), + unsafe.Pointer(&m[0]), C.CUDA_C_64F, + C.CUSTATEVEC_MATRIX_LAYOUT_ROW, + 0, + &tgts[0], C.int(nTargets), + ctrlPtr, ctrlBitsPtr, C.int(nControls), + C.CUSTATEVEC_COMPUTE_64F, + wsPtr, wsSize, + ) + if st != C.CUSTATEVEC_STATUS_SUCCESS { + return fmt.Errorf("custatevecApplyMatrix failed: status %d", int(st)) + } + return nil +} + +// evolve applies all gate operations in the circuit on the GPU. +func evolve(s *Sim, c *ir.Circuit) error { + if c.NumQubits() != s.numQubits { + return fmt.Errorf("circuit has %d qubits, simulator has %d", c.NumQubits(), s.numQubits) + } + if err := resetDevice(s.devicePtr); err != nil { + return err + } + + for _, op := range c.Ops() { + if op.Gate == nil || op.Gate.Name() == "barrier" { + continue + } + if op.Gate.Name() == "reset" { + // Transfer to host, reset qubit, transfer back. + host := copyToHost(s.devicePtr) + resetQubitCPU(host, s.numQubits, op.Qubits[0]) + C.goMemcpyH2D(s.devicePtr.ptr, unsafe.Pointer(&host[0]), C.size_t(len(host)*16)) + continue + } + if _, ok := op.Gate.(gate.StatePrepable); ok { + // State prep: copy amplitudes directly to GPU. + sp := op.Gate.(gate.StatePrepable) + amps := sp.Amplitudes() + if len(op.Qubits) == s.numQubits { + allInOrder := true + for i, q := range op.Qubits { + if q != i { + allInOrder = false + break + } + } + if allInOrder { + // Fast path: copy amplitudes directly. + C.goMemcpyH2D(s.devicePtr.ptr, unsafe.Pointer(&s[0]), C.size_t(len(amps)*16)) + continue + } + } + // Slow path: decompose into 1Q/2Q gates. + applied := op.Gate.Decompose(op.Qubits) + for _, a := range applied { + m := a.Gate.Matrix() + if m == nil { + continue + } + if err := applyGate(s, a.Qubits, nil, m); err != nil { + return err + } + } + continue + } + + m := op.Gate.Matrix() + if m == nil { + continue + } + + if cg, ok := op.Gate.(gate.ControlledGate); ok { + nControls := cg.NumControls() + controls := op.Qubits[:nControls] + targets := op.Qubits[nControls:] + innerM := cg.Inner().Matrix() + if innerM != nil { + if err := applyGate(s, targets, controls, innerM); err != nil { + return err + } + continue + } + } + + // General gate: all qubits are targets. + if err := applyGate(s, op.Qubits, nil, m); err != nil { + return err + } + } + return nil +} + +// resetQubitCPU deterministically resets a qubit to |0⟩ on the host side. +func resetQubitCPU(state []complex128, numQubits, qubit int) { + halfBlock := 1 << qubit + block := halfBlock << 1 + nAmps := 1 << numQubits + for b0 := 0; b0 < nAmps; b0 += block { + for offset := range halfBlock { + i0 := b0 + offset + i1 := i0 + halfBlock + a0, a1 := state[i0], state[i1] + norm := math.Sqrt(real(a0)*real(a0) + imag(a0)*imag(a0) + + real(a1)*real(a1) + imag(a1)*imag(a1)) + if norm > 1e-15 { + state[i0] = complex(norm, 0) + } else { + state[i0] = 0 + } + state[i1] = 0 + } + } +} + +// newSim creates a GPU simulator with cuStateVec. +func newSim(numQubits int) (*Sim, error) { + if numQubits < 1 { + return nil, fmt.Errorf("cuda: numQubits must be >= 1, got %d", numQubits) + } + h, err := createHandle() + if err != nil { + return nil, err + } + d, err := allocDevice(numQubits) + if err != nil { + destroyHandle(h) + return nil, err + } + return &Sim{numQubits: numQubits, handle: h, devicePtr: d}, nil +} + +// stateVector copies the full state from GPU to host. +func stateVector(s *Sim) []complex128 { + return copyToHost(s.devicePtr) +} + +// closeSim frees GPU resources. +func closeSim(s *Sim) error { + freeDevice(&s.devicePtr) + destroyHandle(s.handle) + return nil +} diff --git a/sim/gpu/cuda/kernels_stub.go b/sim/gpu/cuda/kernels_stub.go new file mode 100644 index 0000000..b58936a --- /dev/null +++ b/sim/gpu/cuda/kernels_stub.go @@ -0,0 +1,39 @@ +//go:build !cuda + +package cuda + +import ( + "fmt" + "unsafe" + + "github.com/splch/goqu/circuit/ir" +) + +type cusvHandle struct{} + +type deviceAlloc struct { + ptr unsafe.Pointer + size int +} + +var errNoCUDA = fmt.Errorf("cuda: not available (build with -tags cuda)") + +func newSim(numQubits int) (*Sim, error) { + return nil, errNoCUDA +} + +func run(_ *Sim, _ *ir.Circuit, _ int) (map[string]int, error) { + return nil, errNoCUDA +} + +func evolve(_ *Sim, _ *ir.Circuit) error { + return errNoCUDA +} + +func stateVector(_ *Sim) []complex128 { + return nil +} + +func closeSim(_ *Sim) error { + return nil +} diff --git a/sim/gpu/cuda/measure.go b/sim/gpu/cuda/measure.go new file mode 100644 index 0000000..758e562 --- /dev/null +++ b/sim/gpu/cuda/measure.go @@ -0,0 +1,197 @@ +//go:build cuda + +package cuda + +/* +#include + +// goAbs2Sum computes probabilities on Z-basis. +static custatevecStatus_t goAbs2Sum( + custatevecHandle_t handle, + const void *sv, cudaDataType_t svDataType, int nQubits, + double *abs2sum0, double *abs2sum1, + const int32_t *bitOrdering, int bitOrderingLen +) { + return custatevecAbs2SumOnZBasis( + handle, sv, svDataType, (uint32_t)nQubits, + abs2sum0, abs2sum1, + bitOrdering, (uint32_t)bitOrderingLen + ); +} + +// goSamplerCreate creates a batched sampler. +static custatevecStatus_t goSamplerCreate( + custatevecHandle_t handle, + const void *sv, cudaDataType_t svDataType, int nQubits, + custatevecSamplerDescriptor_t *sampler, + int nMaxShots, + void *workspace, size_t workspaceSize +) { + return custatevecSamplerCreate( + handle, sv, svDataType, (uint32_t)nQubits, + sampler, (uint32_t)nMaxShots, + workspace, workspaceSize + ); +} + +// goSamplerGetWorkspaceSize gets required workspace. +static custatevecStatus_t goSamplerGetWorkspaceSize( + custatevecHandle_t handle, + cudaDataType_t svDataType, int nQubits, + int nMaxShots, + size_t *workspaceSize +) { + return custatevecSamplerPreprocess( + handle, NULL, svDataType, (uint32_t)nQubits, + NULL, (uint32_t)nMaxShots, + NULL, workspaceSize + ); +} + +// goSamplerSample draws samples. +static custatevecStatus_t goSamplerSample( + custatevecSamplerDescriptor_t sampler, + custatevecIndex_t *bitStrings, + const int32_t *bitOrdering, int bitOrderingLen, + const double *randnums, int nShots, + enum custatevecSamplerOutput_t output +) { + return custatevecSamplerSample( + sampler, bitStrings, + bitOrdering, (uint32_t)bitOrderingLen, + randnums, (uint32_t)nShots, output + ); +} + +// goSamplerDestroy destroys the sampler. +static custatevecStatus_t goSamplerDestroy(custatevecSamplerDescriptor_t sampler) { + return custatevecSamplerDestroy(sampler); +} + +// goSamplerPreprocess preprocesses the statevector for sampling. +static custatevecStatus_t goSamplerPreprocess( + custatevecHandle_t handle, + custatevecSamplerDescriptor_t sampler, + cudaDataType_t svDataType, int nQubits, + void *workspace, size_t workspaceSize +) { + size_t ws = workspaceSize; + return custatevecSamplerPreprocess( + handle, sampler, svDataType, (uint32_t)nQubits, + workspace, 0, + workspace, &ws + ); +} +*/ +import "C" +import ( + "fmt" + "math/rand/v2" + "unsafe" + + "github.com/splch/goqu/circuit/ir" +) + +// run executes the circuit and samples measurement results on the GPU. +func run(s *Sim, c *ir.Circuit, shots int) (map[string]int, error) { + if c.IsDynamic() { + return nil, fmt.Errorf("cuda: dynamic circuits not supported") + } + if err := evolve(s, c); err != nil { + return nil, err + } + if shots <= 0 { + return make(map[string]int), nil + } + + // Build bit ordering: [0, 1, ..., nQubits-1]. + bitOrdering := make([]C.int32_t, s.numQubits) + for i := range s.numQubits { + bitOrdering[i] = C.int32_t(i) + } + + // Create sampler. + var sampler C.custatevecSamplerDescriptor_t + var wsSize C.size_t + + // Get workspace size. + st := C.custatevecSamplerPreprocess( + s.handle.h, nil, C.CUDA_C_64F, C.uint32_t(s.numQubits), + nil, C.uint32_t(shots), + nil, &wsSize, + ) + if st != C.CUSTATEVEC_STATUS_SUCCESS { + return nil, fmt.Errorf("custatevecSamplerPreprocess (size) failed: status %d", int(st)) + } + + // Allocate workspace. + var wsPtr unsafe.Pointer + if wsSize > 0 { + if cst := C.cudaMalloc(&wsPtr, wsSize); cst != C.cudaSuccess { + return nil, fmt.Errorf("cudaMalloc sampler workspace failed: status %d", int(cst)) + } + defer C.cudaFree(wsPtr) + } + + // Create the sampler. + st = C.goSamplerCreate( + s.handle.h, + s.devicePtr.ptr, C.CUDA_C_64F, C.int(s.numQubits), + &sampler, C.int(shots), + wsPtr, wsSize, + ) + if st != C.CUSTATEVEC_STATUS_SUCCESS { + return nil, fmt.Errorf("custatevecSamplerCreate failed: status %d", int(st)) + } + defer C.goSamplerDestroy(sampler) + + // Preprocess. + st = C.goSamplerPreprocess( + s.handle.h, sampler, + C.CUDA_C_64F, C.int(s.numQubits), + wsPtr, wsSize, + ) + if st != C.CUSTATEVEC_STATUS_SUCCESS { + return nil, fmt.Errorf("custatevecSamplerPreprocess failed: status %d", int(st)) + } + + // Generate random numbers. + rng := rand.New(rand.NewPCG(rand.Uint64(), rand.Uint64())) + randNums := make([]C.double, shots) + for i := range shots { + randNums[i] = C.double(rng.Float64()) + } + + // Sample. + bitStrings := make([]C.custatevecIndex_t, shots) + st = C.goSamplerSample( + sampler, + &bitStrings[0], + &bitOrdering[0], C.int(s.numQubits), + &randNums[0], C.int(shots), + C.CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER, + ) + if st != C.CUSTATEVEC_STATUS_SUCCESS { + return nil, fmt.Errorf("custatevecSamplerSample failed: status %d", int(st)) + } + + // Convert to bitstring counts. + counts := make(map[string]int) + for _, idx := range bitStrings { + bs := formatBitstring(int(idx), s.numQubits) + counts[bs]++ + } + return counts, nil +} + +func formatBitstring(idx, n int) string { + bs := make([]byte, n) + for i := range n { + if idx&(1< +#include + +static cudaError_t goMalloc(void **ptr, size_t size) { + return cudaMalloc(ptr, size); +} + +static cudaError_t goFree(void *ptr) { + return cudaFree(ptr); +} + +static cudaError_t goMemcpyH2D(void *dst, const void *src, size_t size) { + return cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice); +} + +static cudaError_t goMemcpyD2H(void *dst, const void *src, size_t size) { + return cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost); +} +*/ +import "C" +import ( + "fmt" + "unsafe" +) + +// allocDevice allocates GPU memory for 2^numQubits complex128 values +// and initializes to |0...0>. +func allocDevice(numQubits int) (deviceAlloc, error) { + n := 1 << numQubits + byteSize := C.size_t(n * 16) // complex128 = 16 bytes + + var d deviceAlloc + d.size = n + + if st := C.goMalloc(&d.ptr, byteSize); st != C.cudaSuccess { + return d, fmt.Errorf("cudaMalloc failed: status %d", int(st)) + } + + // Zero the device memory, then set |0> = 1+0i. + if st := C.cudaMemset(d.ptr, 0, byteSize); st != C.cudaSuccess { + C.goFree(d.ptr) + return d, fmt.Errorf("cudaMemset failed: status %d", int(st)) + } + + // Set amplitude[0] = 1+0i on the host side, then copy. + init := complex(1.0, 0.0) + if st := C.goMemcpyH2D(d.ptr, unsafe.Pointer(&init), 16); st != C.cudaSuccess { + C.goFree(d.ptr) + return d, fmt.Errorf("cudaMemcpy H2D failed: status %d", int(st)) + } + + return d, nil +} + +// resetDevice re-initializes the state vector to |0...0>. +func resetDevice(d deviceAlloc) error { + byteSize := C.size_t(d.size * 16) + if st := C.cudaMemset(d.ptr, 0, byteSize); st != C.cudaSuccess { + return fmt.Errorf("cudaMemset failed: status %d", int(st)) + } + init := complex(1.0, 0.0) + if st := C.goMemcpyH2D(d.ptr, unsafe.Pointer(&init), 16); st != C.cudaSuccess { + return fmt.Errorf("cudaMemcpy H2D failed: status %d", int(st)) + } + return nil +} + +// copyToHost transfers the full state vector from GPU to a host slice. +func copyToHost(d deviceAlloc) []complex128 { + out := make([]complex128, d.size) + C.goMemcpyD2H(unsafe.Pointer(&out[0]), d.ptr, C.size_t(d.size*16)) + return out +} + +// freeDevice releases GPU memory. +func freeDevice(d *deviceAlloc) { + if d.ptr != nil { + C.goFree(d.ptr) + d.ptr = nil + } +} diff --git a/sim/gpu/cuda/sim.go b/sim/gpu/cuda/sim.go new file mode 100644 index 0000000..a16d628 --- /dev/null +++ b/sim/gpu/cuda/sim.go @@ -0,0 +1,59 @@ +// Package cuda provides a GPU-accelerated statevector simulator using NVIDIA cuStateVec. +// +// The simulator satisfies [sim.Simulator] and can be used as a drop-in replacement +// for the CPU statevector simulator via [backend/local.WithSimulator]: +// +// import gpucuda "github.com/splch/goqu/sim/gpu/cuda" +// local.New(local.WithSimulator(gpucuda.NewSimulator)) +// +// Build requires the cuda build tag and a working cuStateVec installation. +// Without the tag, stub implementations return an error. +package cuda + +import ( + "github.com/splch/goqu/circuit/ir" + "github.com/splch/goqu/sim" +) + +var _ sim.Simulator = (*Sim)(nil) + +// Sim is a GPU-accelerated statevector simulator backed by cuStateVec. +// The state vector lives on the GPU for its entire lifetime; only gate matrices +// (16-128 bytes each) are transferred per operation. +type Sim struct { + numQubits int + handle cusvHandle // cuStateVec handle + CUDA stream + devicePtr deviceAlloc // GPU memory for the state vector +} + +// NewSimulator is a [sim.SimFactory] that creates a GPU simulator. +// It can be passed directly to [backend/local.WithSimulator]. +func NewSimulator(numQubits int) (sim.Simulator, error) { + return New(numQubits) +} + +// New creates a GPU simulator initialized to |0...0>. +// Returns an error if CUDA or cuStateVec initialization fails. +func New(numQubits int) (*Sim, error) { + return newSim(numQubits) +} + +// Run executes the circuit for the given number of shots and returns measurement counts. +func (s *Sim) Run(c *ir.Circuit, shots int) (map[string]int, error) { + return run(s, c, shots) +} + +// Evolve applies all gate operations without measuring. +func (s *Sim) Evolve(c *ir.Circuit) error { + return evolve(s, c) +} + +// StateVector copies the state vector from GPU to host and returns it. +func (s *Sim) StateVector() []complex128 { + return stateVector(s) +} + +// Close releases the cuStateVec handle and frees GPU memory. +func (s *Sim) Close() error { + return closeSim(s) +} diff --git a/sim/gpu/cuda/sim_test.go b/sim/gpu/cuda/sim_test.go new file mode 100644 index 0000000..0901403 --- /dev/null +++ b/sim/gpu/cuda/sim_test.go @@ -0,0 +1,269 @@ +//go:build cuda + +package cuda + +import ( + "math" + "math/cmplx" + "testing" + + "github.com/splch/goqu/circuit/builder" + "github.com/splch/goqu/circuit/gate" + "github.com/splch/goqu/sim/statevector" +) + +const eps = 1e-10 + +func TestBellState(t *testing.T) { + c, err := builder.New("bell", 2). + H(0). + CNOT(0, 1). + Build() + if err != nil { + t.Fatal(err) + } + + sim, err := New(2) + if err != nil { + t.Fatal(err) + } + defer sim.Close() + + if err := sim.Evolve(c); err != nil { + t.Fatal(err) + } + + sv := sim.StateVector() + s2 := 1.0 / math.Sqrt2 + want := []complex128{complex(s2, 0), 0, 0, complex(s2, 0)} + assertStateClose(t, sv, want) +} + +func TestGHZ3(t *testing.T) { + c, err := builder.New("ghz3", 3). + H(0). + CNOT(0, 1). + CNOT(1, 2). + Build() + if err != nil { + t.Fatal(err) + } + + sim, err := New(3) + if err != nil { + t.Fatal(err) + } + defer sim.Close() + + if err := sim.Evolve(c); err != nil { + t.Fatal(err) + } + + sv := sim.StateVector() + s2 := 1.0 / math.Sqrt2 + want := []complex128{complex(s2, 0), 0, 0, 0, 0, 0, 0, complex(s2, 0)} + assertStateClose(t, sv, want) +} + +func TestSingleX(t *testing.T) { + c, err := builder.New("x", 1).X(0).Build() + if err != nil { + t.Fatal(err) + } + + sim, err := New(1) + if err != nil { + t.Fatal(err) + } + defer sim.Close() + + if err := sim.Evolve(c); err != nil { + t.Fatal(err) + } + + sv := sim.StateVector() + want := []complex128{0, 1} + assertStateClose(t, sv, want) +} + +func TestMeasurementCounts(t *testing.T) { + c, err := builder.New("bell", 2). + H(0). + CNOT(0, 1). + MeasureAll(). + Build() + if err != nil { + t.Fatal(err) + } + + sim, err := New(2) + if err != nil { + t.Fatal(err) + } + defer sim.Close() + + counts, err := sim.Run(c, 10000) + if err != nil { + t.Fatal(err) + } + + for k := range counts { + if k != "00" && k != "11" { + t.Errorf("unexpected measurement outcome: %q", k) + } + } + c00 := counts["00"] + c11 := counts["11"] + if c00 < 4000 || c00 > 6000 { + t.Errorf("counts[00] = %d, expected ~5000", c00) + } + if c11 < 4000 || c11 > 6000 { + t.Errorf("counts[11] = %d, expected ~5000", c11) + } +} + +func TestSWAP(t *testing.T) { + c, err := builder.New("swap", 2). + X(0). + SWAP(0, 1). + Build() + if err != nil { + t.Fatal(err) + } + + sim, err := New(2) + if err != nil { + t.Fatal(err) + } + defer sim.Close() + + if err := sim.Evolve(c); err != nil { + t.Fatal(err) + } + + sv := sim.StateVector() + want := []complex128{0, 0, 1, 0} + assertStateClose(t, sv, want) +} + +func TestCCX_Toffoli(t *testing.T) { + c, err := builder.New("toffoli", 3). + X(1). + X(2). + CCX(2, 1, 0). + Build() + if err != nil { + t.Fatal(err) + } + + sim, err := New(3) + if err != nil { + t.Fatal(err) + } + defer sim.Close() + + if err := sim.Evolve(c); err != nil { + t.Fatal(err) + } + + sv := sim.StateVector() + for i, v := range sv { + if i == 7 { + if cmplx.Abs(v-1) > eps { + t.Errorf("sv[7] = %v, want 1", v) + } + } else { + if cmplx.Abs(v) > eps { + t.Errorf("sv[%d] = %v, want 0", i, v) + } + } + } +} + +// TestCrossValidation runs random circuits on both CPU and GPU, asserting state vectors match. +func TestCrossValidation(t *testing.T) { + gates1Q := []func(int) *builder.Builder{ + func(q int) *builder.Builder { return builder.New("", 4).H(q) }, + func(q int) *builder.Builder { return builder.New("", 4).X(q) }, + func(q int) *builder.Builder { return builder.New("", 4).Y(q) }, + func(q int) *builder.Builder { return builder.New("", 4).Z(q) }, + func(q int) *builder.Builder { return builder.New("", 4).S(q) }, + func(q int) *builder.Builder { return builder.New("", 4).T(q) }, + } + + for i, gf := range gates1Q { + bld := gf(i % 4) + bld.CNOT(0, 1) + bld.H(2) + bld.CNOT(2, 3) + + c, err := bld.Build() + if err != nil { + t.Fatal(err) + } + + cpuSim := statevector.New(4) + if err := cpuSim.Evolve(c); err != nil { + t.Fatal(err) + } + cpuSV := cpuSim.StateVector() + + gpuSim, err := New(4) + if err != nil { + t.Fatal(err) + } + if err := gpuSim.Evolve(c); err != nil { + gpuSim.Close() + t.Fatal(err) + } + gpuSV := gpuSim.StateVector() + gpuSim.Close() + + assertStateClose(t, gpuSV, cpuSV) + } +} + +// TestQFT3 verifies QFT on |000> produces uniform superposition. +func TestQFT3(t *testing.T) { + c, err := builder.New("qft3", 3). + H(0). + Apply(gate.CP(math.Pi/2), 1, 0). + Apply(gate.CP(math.Pi/4), 2, 0). + H(1). + Apply(gate.CP(math.Pi/2), 2, 1). + H(2). + Build() + if err != nil { + t.Fatal(err) + } + + sim, err := New(3) + if err != nil { + t.Fatal(err) + } + defer sim.Close() + + if err := sim.Evolve(c); err != nil { + t.Fatal(err) + } + + sv := sim.StateVector() + amp := 1.0 / math.Sqrt(8) + for i, v := range sv { + if math.Abs(cmplx.Abs(v)-amp) > eps { + t.Errorf("|sv[%d]| = %f, want %f", i, cmplx.Abs(v), amp) + } + } +} + +func assertStateClose(t *testing.T, got, want []complex128) { + t.Helper() + if len(got) != len(want) { + t.Fatalf("state size %d, want %d", len(got), len(want)) + } + for i := range got { + if cmplx.Abs(got[i]-want[i]) > eps { + t.Errorf("state[%d] = %v, want %v", i, got[i], want[i]) + } + } +} diff --git a/sim/gpu/metal/device.go b/sim/gpu/metal/device.go new file mode 100644 index 0000000..d0a14ee --- /dev/null +++ b/sim/gpu/metal/device.go @@ -0,0 +1,33 @@ +//go:build !darwin + +package metal + +import ( + "fmt" + + "github.com/splch/goqu/circuit/ir" +) + +type metalDevice struct{} + +var errNoMetal = fmt.Errorf("metal: not available (build with -tags metal)") + +func newSim(_ int) (*Sim, error) { + return nil, errNoMetal +} + +func run(_ *Sim, _ *ir.Circuit, _ int) (map[string]int, error) { + return nil, errNoMetal +} + +func evolve(_ *Sim, _ *ir.Circuit) error { + return errNoMetal +} + +func stateVector(_ *Sim) []complex128 { + return nil +} + +func closeSim(_ *Sim) error { + return nil +} diff --git a/sim/gpu/metal/device_darwin.go b/sim/gpu/metal/device_darwin.go new file mode 100644 index 0000000..332b7cc --- /dev/null +++ b/sim/gpu/metal/device_darwin.go @@ -0,0 +1,127 @@ +//go:build darwin + +package metal + +/* +#cgo CFLAGS: -fno-objc-arc +#cgo LDFLAGS: -framework Metal -framework Foundation +#include "metal_bridge.h" +#include +*/ +import "C" +import ( + "fmt" + "unsafe" +) + +type metalDevice struct { + sim *C.MetalSim +} + +func newSim(numQubits int) (*Sim, error) { + var errStr *C.char + msim := C.MetalCreate(C.int(numQubits), &errStr) + if msim == nil { + msg := C.GoString(errStr) + C.free(unsafe.Pointer(errStr)) + return nil, fmt.Errorf("metal: %s", msg) + } + return &Sim{ + numQubits: numQubits, + device: metalDevice{sim: msim}, + }, nil +} + +func closeSim(s *Sim) error { + if s.device.sim != nil { + C.MetalDestroy(s.device.sim) + s.device.sim = nil + } + return nil +} + +// stateVector returns a copy of the GPU state as []complex128 (converting float32→float64). +func stateVector(s *Sim) []complex128 { + nAmps := 1 << s.numQubits + out := make([]complex128, nAmps) + ptr := C.MetalStatePtr(s.device.sim) + src := unsafe.Slice((*float32)(unsafe.Pointer(ptr)), nAmps*2) + for i := range nAmps { + out[i] = complex(float64(src[2*i]), float64(src[2*i+1])) + } + return out +} + +// stateVectorF32 returns a slice of float32 pairs backed by the shared Metal buffer. +func stateVectorF32(s *Sim) []float32 { + nAmps := 1 << s.numQubits + ptr := C.MetalStatePtr(s.device.sim) + return unsafe.Slice((*float32)(unsafe.Pointer(ptr)), nAmps*2) +} + +// --- Go wrappers for C bridge functions --- + +func metalResetState(s *Sim) { + C.MetalResetState(s.device.sim) +} + +func metalBeginPass(s *Sim) error { + var errStr *C.char + if C.MetalBeginPass(s.device.sim, &errStr) != 0 { + msg := C.GoString(errStr) + C.free(unsafe.Pointer(errStr)) + return fmt.Errorf("metal: %s", msg) + } + return nil +} + +func metalEndPass(s *Sim) error { + var errStr *C.char + if C.MetalEndPass(s.device.sim, &errStr) != 0 { + msg := C.GoString(errStr) + C.free(unsafe.Pointer(errStr)) + return fmt.Errorf("metal: %s", msg) + } + return nil +} + +// metalGate1Q dispatches a 1Q gate. m is a 2x2 complex128 matrix (4 elements). +func metalGate1Q(s *Sim, qubit int, m []complex128) error { + var fm [8]float32 + for i, c := range m { + fm[2*i] = float32(real(c)) + fm[2*i+1] = float32(imag(c)) + } + var errStr *C.char + if C.MetalGate1Q(s.device.sim, C.uint32_t(qubit), (*C.float)(&fm[0]), &errStr) != 0 { + msg := C.GoString(errStr) + C.free(unsafe.Pointer(errStr)) + return fmt.Errorf("metal: %s", msg) + } + return nil +} + +// metalGate2Q dispatches a 2Q gate. m is a 4x4 complex128 matrix (16 elements). +func metalGate2Q(s *Sim, q0, q1 int, m []complex128) error { + var fm [32]float32 + for i, c := range m { + fm[2*i] = float32(real(c)) + fm[2*i+1] = float32(imag(c)) + } + var errStr *C.char + if C.MetalGate2Q(s.device.sim, C.uint32_t(q0), C.uint32_t(q1), (*C.float)(&fm[0]), &errStr) != 0 { + msg := C.GoString(errStr) + C.free(unsafe.Pointer(errStr)) + return fmt.Errorf("metal: %s", msg) + } + return nil +} + +// writeStateF32 writes amplitudes to the shared buffer (converting float64→float32). +func writeStateF32(s *Sim, amps []complex128) { + buf := stateVectorF32(s) + for i, a := range amps { + buf[2*i] = float32(real(a)) + buf[2*i+1] = float32(imag(a)) + } +} diff --git a/sim/gpu/metal/go.mod b/sim/gpu/metal/go.mod new file mode 100644 index 0000000..ad09279 --- /dev/null +++ b/sim/gpu/metal/go.mod @@ -0,0 +1,7 @@ +module github.com/splch/goqu/sim/gpu/metal + +go 1.24 + +require github.com/splch/goqu v0.0.0 + +replace github.com/splch/goqu => ../../../ diff --git a/sim/gpu/metal/kernels_darwin.go b/sim/gpu/metal/kernels_darwin.go new file mode 100644 index 0000000..c95faba --- /dev/null +++ b/sim/gpu/metal/kernels_darwin.go @@ -0,0 +1,228 @@ +//go:build darwin + +package metal + +import ( + "fmt" + "math" + "math/rand/v2" + + "github.com/splch/goqu/circuit/gate" + "github.com/splch/goqu/circuit/ir" +) + +func evolve(s *Sim, c *ir.Circuit) error { + if c.NumQubits() != s.numQubits { + return fmt.Errorf("metal: circuit has %d qubits, simulator has %d", + c.NumQubits(), s.numQubits) + } + + metalResetState(s) + + if err := metalBeginPass(s); err != nil { + return err + } + passOpen := true + + endPass := func() error { + if passOpen { + passOpen = false + return metalEndPass(s) + } + return nil + } + beginPass := func() error { + if !passOpen { + if err := metalBeginPass(s); err != nil { + return err + } + passOpen = true + } + return nil + } + + for _, op := range c.Ops() { + if op.Gate == nil || op.Gate.Name() == "barrier" { + continue + } + + if op.Gate.Name() == "reset" { + if err := endPass(); err != nil { + return err + } + resetQubitCPU(s, op.Qubits[0]) + if err := beginPass(); err != nil { + return err + } + continue + } + + if sp, ok := op.Gate.(gate.StatePrepable); ok { + amps := sp.Amplitudes() + if len(op.Qubits) == s.numQubits { + allInOrder := true + for i, q := range op.Qubits { + if q != i { + allInOrder = false + break + } + } + if allInOrder { + if err := endPass(); err != nil { + return err + } + writeStateF32(s, amps) + if err := beginPass(); err != nil { + return err + } + continue + } + } + applied := op.Gate.Decompose(op.Qubits) + for _, a := range applied { + m := a.Gate.Matrix() + if m == nil { + continue + } + if err := dispatchOp(s, a.Gate, a.Qubits); err != nil { + _ = endPass() + return err + } + } + continue + } + + if err := dispatchOp(s, op.Gate, op.Qubits); err != nil { + _ = endPass() + return err + } + } + + return endPass() +} + +func dispatchOp(s *Sim, g gate.Gate, qubits []int) error { + switch g.Qubits() { + case 1: + return metalGate1Q(s, qubits[0], g.Matrix()) + case 2: + return dispatchGate2(s, qubits[0], qubits[1], g.Matrix()) + default: + applied := g.Decompose(qubits) + if applied != nil { + for _, a := range applied { + m := a.Gate.Matrix() + if m == nil { + continue + } + if err := dispatchOp(s, a.Gate, a.Qubits); err != nil { + return err + } + } + return nil + } + return fmt.Errorf("metal: unsupported gate %s (%d qubits)", + g.Name(), g.Qubits()) + } +} + +func dispatchGate2(s *Sim, q0, q1 int, m []complex128) error { + if q0 < q1 { + // CPU convention: row 1 = q1 set, row 2 = q0 set. + // Metal convention: a[1] = lower qubit (q0) set, a[2] = higher qubit (q1) set. + // When q0 < q1, rows 1 and 2 are swapped — permute the matrix. + m = permuteMatrix2Q(m) + return metalGate2Q(s, q0, q1, m) + } + // q0 > q1: swap qubit args for Metal (requires sorted). The CPU and Metal + // conventions happen to align when the original q0 is the higher qubit. + return metalGate2Q(s, q1, q0, m) +} + +// permuteMatrix2Q reorders a 4x4 gate matrix when swapping qubit order. +func permuteMatrix2Q(m []complex128) []complex128 { + perm := [4]int{0, 2, 1, 3} + out := make([]complex128, 16) + for r := range 4 { + for c := range 4 { + out[r*4+c] = m[perm[r]*4+perm[c]] + } + } + return out +} + +func run(s *Sim, c *ir.Circuit, shots int) (map[string]int, error) { + if c.IsDynamic() { + return nil, fmt.Errorf("metal: dynamic circuits not supported") + } + if err := evolve(s, c); err != nil { + return nil, err + } + + // Read float32 probabilities from shared buffer. + buf := stateVectorF32(s) + nAmps := 1 << s.numQubits + probs := make([]float64, nAmps) + for i := range nAmps { + re := float64(buf[2*i]) + im := float64(buf[2*i+1]) + probs[i] = re*re + im*im + } + + counts := make(map[string]int) + rng := rand.New(rand.NewPCG(rand.Uint64(), rand.Uint64())) + for range shots { + idx := sampleIndex(probs, rng) + bs := formatBitstring(idx, s.numQubits) + counts[bs]++ + } + return counts, nil +} + +func resetQubitCPU(s *Sim, qubit int) { + buf := stateVectorF32(s) + nAmps := 1 << s.numQubits + halfBlock := 1 << qubit + block := halfBlock << 1 + for b0 := 0; b0 < nAmps; b0 += block { + for offset := range halfBlock { + i0 := b0 + offset + i1 := i0 + halfBlock + r0, im0 := float64(buf[2*i0]), float64(buf[2*i0+1]) + r1, im1 := float64(buf[2*i1]), float64(buf[2*i1+1]) + norm := math.Sqrt(r0*r0 + im0*im0 + r1*r1 + im1*im1) + if norm > 1e-15 { + buf[2*i0] = float32(norm) + } else { + buf[2*i0] = 0 + } + buf[2*i0+1] = 0 + buf[2*i1] = 0 + buf[2*i1+1] = 0 + } + } +} + +func sampleIndex(probs []float64, rng *rand.Rand) int { + r := rng.Float64() + cum := 0.0 + for i, p := range probs { + cum += p + if r < cum { + return i + } + } + return len(probs) - 1 +} + +func formatBitstring(idx, n int) string { + bs := make([]byte, n) + for i := range n { + if idx&(1< + +typedef struct MetalSim MetalSim; + +// MetalCreate creates a Metal simulator for numQubits qubits. +// Returns NULL on error; *errOut is set to a strdup'd message (caller frees). +MetalSim* MetalCreate(int numQubits, char** errOut); + +// MetalDestroy releases all Metal resources. +void MetalDestroy(MetalSim* sim); + +// MetalStatePtr returns a pointer to the float32 state vector in shared memory. +// Layout: 2*numAmps floats (pairs of real, imag) — i.e., complex64. +float* MetalStatePtr(MetalSim* sim); + +// MetalNumAmps returns 2^numQubits. +int MetalNumAmps(MetalSim* sim); + +// MetalResetState sets the state to |0...0>. +void MetalResetState(MetalSim* sim); + +// MetalBeginPass creates a new command buffer and compute encoder. +int MetalBeginPass(MetalSim* sim, char** errOut); + +// MetalGate1Q encodes a 1-qubit gate. matrix: 8 floats (4 complex, row-major). +int MetalGate1Q(MetalSim* sim, uint32_t qubit, const float* matrix, char** errOut); + +// MetalGate2Q encodes a 2-qubit gate. qubit0 < qubit1 required. +// matrix: 32 floats (16 complex, row-major). +int MetalGate2Q(MetalSim* sim, uint32_t qubit0, uint32_t qubit1, + const float* matrix, char** errOut); + +// MetalEndPass commits the command buffer and waits for completion. +int MetalEndPass(MetalSim* sim, char** errOut); + +#endif diff --git a/sim/gpu/metal/metal_bridge.m b/sim/gpu/metal/metal_bridge.m new file mode 100644 index 0000000..c85d3d7 --- /dev/null +++ b/sim/gpu/metal/metal_bridge.m @@ -0,0 +1,359 @@ +#import +#import +#include "metal_bridge.h" +#include +#include + +#if __has_feature(objc_arc) +#error "This file must be compiled without ARC (-fno-objc-arc)" +#endif + +// --------------------------------------------------------------------------- +// Shader source (embedded) — 1Q and 2Q gate kernels using float2 (complex64). +// --------------------------------------------------------------------------- + +static const char *kShaderSource = +"#include \n" +"using namespace metal;\n" +"\n" +"// ---- 1-qubit gate ----\n" +"struct Gate1QParams {\n" +" uint qubit;\n" +" uint nAmps;\n" +" float m[8];\n" +"};\n" +"\n" +"inline float2 cmul(float2 a, float2 b) {\n" +" return float2(a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);\n" +"}\n" +"inline float2 cadd(float2 a, float2 b) {\n" +" return float2(a.x + b.x, a.y + b.y);\n" +"}\n" +"\n" +"kernel void gate1q(\n" +" device float2 *sv [[buffer(0)]],\n" +" constant Gate1QParams ¶ms [[buffer(1)]],\n" +" uint gid [[thread_position_in_grid]]\n" +") {\n" +" uint halfBlock = 1u << params.qubit;\n" +" uint block = halfBlock << 1u;\n" +" uint blockIdx = gid / halfBlock;\n" +" uint offset = gid % halfBlock;\n" +" uint b0 = blockIdx * block;\n" +" uint i0 = b0 + offset;\n" +" uint i1 = i0 + halfBlock;\n" +" if (i1 >= params.nAmps) return;\n" +" float2 a0 = sv[i0];\n" +" float2 a1 = sv[i1];\n" +" float2 m00 = float2(params.m[0], params.m[1]);\n" +" float2 m01 = float2(params.m[2], params.m[3]);\n" +" float2 m10 = float2(params.m[4], params.m[5]);\n" +" float2 m11 = float2(params.m[6], params.m[7]);\n" +" sv[i0] = cadd(cmul(m00, a0), cmul(m01, a1));\n" +" sv[i1] = cadd(cmul(m10, a0), cmul(m11, a1));\n" +"}\n" +"\n" +"// ---- 2-qubit gate ----\n" +"struct Gate2QParams {\n" +" uint qubit0;\n" +" uint qubit1;\n" +" uint nAmps;\n" +" float m[32];\n" +"};\n" +"\n" +"kernel void gate2q(\n" +" device float2 *sv [[buffer(0)]],\n" +" constant Gate2QParams ¶ms [[buffer(1)]],\n" +" uint gid [[thread_position_in_grid]]\n" +") {\n" +" uint q0 = params.qubit0;\n" +" uint q1 = params.qubit1;\n" +" uint idx = gid;\n" +" uint lo0 = idx & ((1u << q0) - 1u);\n" +" uint hi0 = idx >> q0;\n" +" idx = (hi0 << (q0 + 1u)) | lo0;\n" +" uint lo1 = idx & ((1u << q1) - 1u);\n" +" uint hi1 = idx >> q1;\n" +" idx = (hi1 << (q1 + 1u)) | lo1;\n" +" uint i00 = idx;\n" +" uint i01 = idx | (1u << q0);\n" +" uint i10 = idx | (1u << q1);\n" +" uint i11 = idx | (1u << q0) | (1u << q1);\n" +" if (i11 >= params.nAmps) return;\n" +" float2 a[4] = { sv[i00], sv[i01], sv[i10], sv[i11] };\n" +" float2 r[4];\n" +" for (int row = 0; row < 4; row++) {\n" +" r[row] = float2(0.0, 0.0);\n" +" for (int col = 0; col < 4; col++) {\n" +" float2 mv = float2(params.m[(row*4+col)*2], params.m[(row*4+col)*2+1]);\n" +" r[row] = cadd(r[row], cmul(mv, a[col]));\n" +" }\n" +" }\n" +" sv[i00] = r[0];\n" +" sv[i01] = r[1];\n" +" sv[i10] = r[2];\n" +" sv[i11] = r[3];\n" +"}\n"; + +// --------------------------------------------------------------------------- +// C-side parameter structs (must match shader layout exactly). +// --------------------------------------------------------------------------- + +typedef struct { + uint32_t qubit; + uint32_t nAmps; + float m[8]; +} Gate1QParams; + +typedef struct { + uint32_t qubit0; + uint32_t qubit1; + uint32_t nAmps; + float m[32]; +} Gate2QParams; + +// --------------------------------------------------------------------------- +// MetalSim — opaque handle holding all Metal state. +// --------------------------------------------------------------------------- + +struct MetalSim { + int numQubits; + int numAmps; + id device; + id queue; + id pipe1q; + id pipe2q; + id svBuffer; + // Transient per-pass state (non-nil between BeginPass/EndPass). + id cmdBuf; + id encoder; + NSUInteger tgSize1q; + NSUInteger tgSize2q; +}; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +static char* errdup(NSError *err) { + if (err) { + return strdup([[err localizedDescription] UTF8String]); + } + return strdup("unknown Metal error"); +} + +// --------------------------------------------------------------------------- +// Public API +// --------------------------------------------------------------------------- + +MetalSim* MetalCreate(int numQubits, char** errOut) { + @autoreleasepool { + if (numQubits < 1) { + *errOut = strdup("numQubits must be >= 1"); + return NULL; + } + + id device = MTLCreateSystemDefaultDevice(); + if (!device) { + *errOut = strdup("no Metal device available"); + return NULL; + } + + // Compile shaders. + NSError *error = nil; + NSString *src = [NSString stringWithUTF8String:kShaderSource]; + id library = [device newLibraryWithSource:src options:nil error:&error]; + if (!library) { + *errOut = errdup(error); + [device release]; + return NULL; + } + + id func1q = [library newFunctionWithName:@"gate1q"]; + id func2q = [library newFunctionWithName:@"gate2q"]; + if (!func1q || !func2q) { + *errOut = strdup("shader function not found"); + if (func1q) [func1q release]; + if (func2q) [func2q release]; + [library release]; + [device release]; + return NULL; + } + + id pipe1q = + [device newComputePipelineStateWithFunction:func1q error:&error]; + if (!pipe1q) { + *errOut = errdup(error); + [func1q release]; [func2q release]; + [library release]; [device release]; + return NULL; + } + id pipe2q = + [device newComputePipelineStateWithFunction:func2q error:&error]; + if (!pipe2q) { + *errOut = errdup(error); + [pipe1q release]; [func1q release]; [func2q release]; + [library release]; [device release]; + return NULL; + } + + [func1q release]; + [func2q release]; + [library release]; + + // Allocate shared-memory state-vector buffer (float2 = 8 bytes per amplitude). + int numAmps = 1 << numQubits; + NSUInteger bufSize = (NSUInteger)numAmps * 2 * sizeof(float); + id svBuffer = [device newBufferWithLength:bufSize + options:MTLResourceStorageModeShared]; + if (!svBuffer) { + *errOut = strdup("failed to allocate state vector buffer"); + [pipe2q release]; [pipe1q release]; [device release]; + return NULL; + } + + // Initialize to |0...0>: first amplitude = (1, 0). + float *ptr = (float*)[svBuffer contents]; + memset(ptr, 0, bufSize); + ptr[0] = 1.0f; + + id queue = [device newCommandQueue]; + + MetalSim *sim = (MetalSim*)calloc(1, sizeof(MetalSim)); + sim->numQubits = numQubits; + sim->numAmps = numAmps; + sim->device = device; + sim->queue = queue; + sim->pipe1q = pipe1q; + sim->pipe2q = pipe2q; + sim->svBuffer = svBuffer; + sim->cmdBuf = nil; + sim->encoder = nil; + sim->tgSize1q = [pipe1q maxTotalThreadsPerThreadgroup]; + sim->tgSize2q = [pipe2q maxTotalThreadsPerThreadgroup]; + if (sim->tgSize1q > 256) sim->tgSize1q = 256; + if (sim->tgSize2q > 256) sim->tgSize2q = 256; + return sim; + } +} + +void MetalDestroy(MetalSim* sim) { + if (!sim) return; + @autoreleasepool { + if (sim->encoder) { [sim->encoder endEncoding]; [sim->encoder release]; } + if (sim->cmdBuf) { [sim->cmdBuf release]; } + [sim->svBuffer release]; + [sim->pipe2q release]; + [sim->pipe1q release]; + [sim->queue release]; + [sim->device release]; + } + free(sim); +} + +float* MetalStatePtr(MetalSim* sim) { + return (float*)[sim->svBuffer contents]; +} + +int MetalNumAmps(MetalSim* sim) { + return sim->numAmps; +} + +void MetalResetState(MetalSim* sim) { + float *ptr = (float*)[sim->svBuffer contents]; + memset(ptr, 0, (size_t)sim->numAmps * 2 * sizeof(float)); + ptr[0] = 1.0f; +} + +int MetalBeginPass(MetalSim* sim, char** errOut) { + @autoreleasepool { + id buf = [sim->queue commandBuffer]; + if (!buf) { + *errOut = strdup("failed to create command buffer"); + return -1; + } + sim->cmdBuf = [buf retain]; + + id enc = [sim->cmdBuf computeCommandEncoder]; + if (!enc) { + *errOut = strdup("failed to create compute encoder"); + [sim->cmdBuf release]; + sim->cmdBuf = nil; + return -1; + } + sim->encoder = [enc retain]; + } + return 0; +} + +int MetalGate1Q(MetalSim* sim, uint32_t qubit, const float* matrix, + char** errOut) { + (void)errOut; + Gate1QParams params; + params.qubit = qubit; + params.nAmps = (uint32_t)sim->numAmps; + memcpy(params.m, matrix, 8 * sizeof(float)); + + [sim->encoder setComputePipelineState:sim->pipe1q]; + [sim->encoder setBuffer:sim->svBuffer offset:0 atIndex:0]; + [sim->encoder setBytes:¶ms length:sizeof(params) atIndex:1]; + + NSUInteger numThreads = (NSUInteger)sim->numAmps / 2; + NSUInteger tgSize = sim->tgSize1q; + if (tgSize > numThreads) tgSize = numThreads; + if (tgSize < 1) tgSize = 1; + NSUInteger numGroups = (numThreads + tgSize - 1) / tgSize; + + [sim->encoder dispatchThreadgroups:MTLSizeMake(numGroups, 1, 1) + threadsPerThreadgroup:MTLSizeMake(tgSize, 1, 1)]; + [sim->encoder memoryBarrierWithScope:MTLBarrierScopeBuffers]; + return 0; +} + +int MetalGate2Q(MetalSim* sim, uint32_t qubit0, uint32_t qubit1, + const float* matrix, char** errOut) { + (void)errOut; + Gate2QParams params; + params.qubit0 = qubit0; + params.qubit1 = qubit1; + params.nAmps = (uint32_t)sim->numAmps; + memcpy(params.m, matrix, 32 * sizeof(float)); + + [sim->encoder setComputePipelineState:sim->pipe2q]; + [sim->encoder setBuffer:sim->svBuffer offset:0 atIndex:0]; + [sim->encoder setBytes:¶ms length:sizeof(params) atIndex:1]; + + NSUInteger numThreads = (NSUInteger)sim->numAmps / 4; + NSUInteger tgSize = sim->tgSize2q; + if (tgSize > numThreads) tgSize = numThreads; + if (tgSize < 1) tgSize = 1; + NSUInteger numGroups = (numThreads + tgSize - 1) / tgSize; + + [sim->encoder dispatchThreadgroups:MTLSizeMake(numGroups, 1, 1) + threadsPerThreadgroup:MTLSizeMake(tgSize, 1, 1)]; + [sim->encoder memoryBarrierWithScope:MTLBarrierScopeBuffers]; + return 0; +} + +int MetalEndPass(MetalSim* sim, char** errOut) { + @autoreleasepool { + [sim->encoder endEncoding]; + [sim->encoder release]; + sim->encoder = nil; + + [sim->cmdBuf commit]; + [sim->cmdBuf waitUntilCompleted]; + + if ([sim->cmdBuf status] == MTLCommandBufferStatusError) { + NSError *error = [sim->cmdBuf error]; + *errOut = errdup(error); + [sim->cmdBuf release]; + sim->cmdBuf = nil; + return -1; + } + + [sim->cmdBuf release]; + sim->cmdBuf = nil; + } + return 0; +} diff --git a/sim/gpu/metal/sim.go b/sim/gpu/metal/sim.go new file mode 100644 index 0000000..0577f77 --- /dev/null +++ b/sim/gpu/metal/sim.go @@ -0,0 +1,55 @@ +// Package metal provides a GPU-accelerated statevector simulator using Apple Metal compute shaders. +// +// The simulator satisfies [sim.Simulator] and can be used as a drop-in replacement +// for the CPU statevector simulator via [backend/local.WithSimulator]: +// +// import gpumetal "github.com/splch/goqu/sim/gpu/metal" +// local.New(local.WithSimulator(gpumetal.NewSimulator)) +// +// Build requires the metal build tag and macOS/Apple Silicon. +// Without the tag, stub implementations return an error. +package metal + +import ( + "github.com/splch/goqu/circuit/ir" + "github.com/splch/goqu/sim" +) + +var _ sim.Simulator = (*Sim)(nil) + +// Sim is a GPU-accelerated statevector simulator backed by Apple Metal compute shaders. +// On Apple Silicon, unified memory eliminates explicit CPU-GPU transfers. +type Sim struct { + numQubits int + device metalDevice +} + +// NewSimulator is a [sim.SimFactory] that creates a Metal GPU simulator. +func NewSimulator(numQubits int) (sim.Simulator, error) { + return New(numQubits) +} + +// New creates a Metal GPU simulator initialized to |0...0>. +func New(numQubits int) (*Sim, error) { + return newSim(numQubits) +} + +// Run executes the circuit and returns measurement counts. +func (s *Sim) Run(c *ir.Circuit, shots int) (map[string]int, error) { + return run(s, c, shots) +} + +// Evolve applies all gate operations without measuring. +func (s *Sim) Evolve(c *ir.Circuit) error { + return evolve(s, c) +} + +// StateVector returns a copy of the current state vector. +func (s *Sim) StateVector() []complex128 { + return stateVector(s) +} + +// Close releases Metal device resources. +func (s *Sim) Close() error { + return closeSim(s) +} diff --git a/sim/gpu/metal/sim_test.go b/sim/gpu/metal/sim_test.go new file mode 100644 index 0000000..7484a60 --- /dev/null +++ b/sim/gpu/metal/sim_test.go @@ -0,0 +1,172 @@ +//go:build darwin + +package metal + +import ( + "math" + "math/cmplx" + "testing" + + "github.com/splch/goqu/circuit/builder" + "github.com/splch/goqu/sim/statevector" +) + +// Metal uses float32 precision; allow ~1e-5 tolerance. +const eps = 1e-5 + +func TestBellState(t *testing.T) { + c, err := builder.New("bell", 2). + H(0). + CNOT(0, 1). + Build() + if err != nil { + t.Fatal(err) + } + + sim, err := New(2) + if err != nil { + t.Fatal(err) + } + defer sim.Close() + + if err := sim.Evolve(c); err != nil { + t.Fatal(err) + } + + sv := sim.StateVector() + s2 := 1.0 / math.Sqrt2 + want := []complex128{complex(s2, 0), 0, 0, complex(s2, 0)} + assertStateClose(t, sv, want) +} + +func TestCrossValidation(t *testing.T) { + bld := builder.New("cross", 4) + bld.H(0) + bld.CNOT(0, 1) + bld.H(2) + bld.CNOT(2, 3) + c, err := bld.Build() + if err != nil { + t.Fatal(err) + } + + cpuSim := statevector.New(4) + if err := cpuSim.Evolve(c); err != nil { + t.Fatal(err) + } + cpuSV := cpuSim.StateVector() + + gpuSim, err := New(4) + if err != nil { + t.Fatal(err) + } + defer gpuSim.Close() + + if err := gpuSim.Evolve(c); err != nil { + t.Fatal(err) + } + gpuSV := gpuSim.StateVector() + + assertStateClose(t, gpuSV, cpuSV) +} + +func TestReversedQubits(t *testing.T) { + // CNOT with control > target to test qubit-swap path. + c, err := builder.New("rev", 3). + H(2). + CNOT(2, 0). + Build() + if err != nil { + t.Fatal(err) + } + + cpuSim := statevector.New(3) + if err := cpuSim.Evolve(c); err != nil { + t.Fatal(err) + } + + gpuSim, err := New(3) + if err != nil { + t.Fatal(err) + } + defer gpuSim.Close() + + if err := gpuSim.Evolve(c); err != nil { + t.Fatal(err) + } + + assertStateClose(t, gpuSim.StateVector(), cpuSim.StateVector()) +} + +func TestRun(t *testing.T) { + c, err := builder.New("bell", 2). + H(0). + CNOT(0, 1). + Build() + if err != nil { + t.Fatal(err) + } + + sim, err := New(2) + if err != nil { + t.Fatal(err) + } + defer sim.Close() + + counts, err := sim.Run(c, 1000) + if err != nil { + t.Fatal(err) + } + + // Bell state should only produce "00" and "11". + for bs := range counts { + if bs != "00" && bs != "11" { + t.Errorf("unexpected bitstring %q", bs) + } + } + if counts["00"]+counts["11"] != 1000 { + t.Errorf("total shots = %d, want 1000", counts["00"]+counts["11"]) + } +} + +func TestGHZ(t *testing.T) { + // 5-qubit GHZ state: (|00000> + |11111>) / sqrt(2) + bld := builder.New("ghz", 5) + bld.H(0) + for i := range 4 { + bld.CNOT(i, i+1) + } + c, err := bld.Build() + if err != nil { + t.Fatal(err) + } + + cpuSim := statevector.New(5) + if err := cpuSim.Evolve(c); err != nil { + t.Fatal(err) + } + + gpuSim, err := New(5) + if err != nil { + t.Fatal(err) + } + defer gpuSim.Close() + + if err := gpuSim.Evolve(c); err != nil { + t.Fatal(err) + } + + assertStateClose(t, gpuSim.StateVector(), cpuSim.StateVector()) +} + +func assertStateClose(t *testing.T, got, want []complex128) { + t.Helper() + if len(got) != len(want) { + t.Fatalf("state size %d, want %d", len(got), len(want)) + } + for i := range got { + if cmplx.Abs(got[i]-want[i]) > eps { + t.Errorf("state[%d] = %v, want %v", i, got[i], want[i]) + } + } +} diff --git a/sim/sim.go b/sim/sim.go new file mode 100644 index 0000000..72367a5 --- /dev/null +++ b/sim/sim.go @@ -0,0 +1,22 @@ +// Package sim defines the Simulator interface satisfied by all backends +// (CPU statevector, density matrix, GPU, etc.). +package sim + +import "github.com/splch/goqu/circuit/ir" + +// Simulator is the common interface for quantum circuit simulators. +// Both CPU and GPU implementations satisfy this interface. +type Simulator interface { + // Run executes the circuit for the given number of shots and returns measurement counts. + Run(c *ir.Circuit, shots int) (map[string]int, error) + + // Evolve applies all gate operations without measuring, leaving the state accessible. + Evolve(c *ir.Circuit) error + + // StateVector returns a copy of the current state vector. + StateVector() []complex128 + + // Close releases any resources held by the simulator. + // For CPU simulators this is a no-op; GPU simulators free device memory. + Close() error +} diff --git a/sim/statevector/sim.go b/sim/statevector/sim.go index 58ef253..4f017ce 100644 --- a/sim/statevector/sim.go +++ b/sim/statevector/sim.go @@ -11,9 +11,12 @@ import ( "github.com/splch/goqu/circuit/gate" "github.com/splch/goqu/circuit/ir" + "github.com/splch/goqu/sim" "github.com/splch/goqu/sim/pauli" ) +var _ sim.Simulator = (*Sim)(nil) + // Sim simulates a circuit via full statevector evolution. type Sim struct { numQubits int @@ -61,6 +64,9 @@ func (s *Sim) StateVector() []complex128 { return out } +// Close is a no-op for the CPU statevector simulator. It satisfies sim.Simulator. +func (s *Sim) Close() error { return nil } + // Evolve applies all gate operations without measuring, leaving the statevector accessible. func (s *Sim) Evolve(c *ir.Circuit) error { if c.NumQubits() != s.numQubits {