Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 85 additions & 0 deletions extensions/native/circuit/cuda/include/native/sumcheck.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#pragma once

#include "primitives/constants.h"
#include "system/memory/offline_checker.cuh"

using namespace native;

template <typename T> struct HeaderSpecificCols {
T pc;
T registers[5];
MemoryReadAuxCols<T> read_records[7];
MemoryWriteAuxCols<T, EXT_DEG> write_records;
};

template <typename T> struct ProdSpecificCols {
T data_ptr;
T p[EXT_DEG * 2];
MemoryReadAuxCols<T> read_records[2];
T p_evals[EXT_DEG];
MemoryWriteAuxCols<T, EXT_DEG> write_record;
T eval_rlc[EXT_DEG];
};

template <typename T> struct LogupSpecificCols {
T data_ptr;
T pq[EXT_DEG * 4];
MemoryReadAuxCols<T> read_records[2];
T p_evals[EXT_DEG];
T q_evals[EXT_DEG];
MemoryWriteAuxCols<T, EXT_DEG> write_records[2];
T eval_rlc[EXT_DEG];
};

template <typename T> constexpr T constexpr_max(T a, T b) {
return a > b ? a : b;
}

constexpr size_t COL_SPECIFIC_WIDTH = constexpr_max(
sizeof(HeaderSpecificCols<uint8_t>),
constexpr_max(sizeof(ProdSpecificCols<uint8_t>), sizeof(LogupSpecificCols<uint8_t>))
);

template <typename T> struct NativeSumcheckCols {
T header_row;
T prod_row;
T logup_row;
T is_end;

T prod_continued;
T logup_continued;

T prod_in_round_evaluation;
T prod_next_round_evaluation;
T logup_in_round_evaluation;
T logup_next_round_evaluation;

T prod_acc;
T logup_acc;

T first_timestamp;
T start_timestamp;
T last_timestamp;

T register_ptrs[5];

T ctx[EXT_DEG * 2];

T prod_nested_len;
T logup_nested_len;

T curr_prod_n;
T curr_logup_n;

T alpha[EXT_DEG];
T challenges[EXT_DEG * 4];

T max_round;
T within_round_limit;
T should_acc;

T eval_acc[EXT_DEG];

T specific[COL_SPECIFIC_WIDTH];
};

13 changes: 13 additions & 0 deletions extensions/native/circuit/cuda/include/native/utils.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#pragma once

#include "primitives/trace_access.h"
#include "system/memory/controller.cuh"

__device__ __forceinline__ void mem_fill_base(
MemoryAuxColsFactory &mem_helper,
uint32_t timestamp,
RowSlice base_aux
) {
uint32_t prev = base_aux[COL_INDEX(MemoryBaseAuxCols, prev_timestamp)].asUInt32();
mem_helper.fill(base_aux, prev, timestamp);
}
10 changes: 1 addition & 9 deletions extensions/native/circuit/cuda/src/poseidon2.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "poseidon2-air/columns.cuh"
#include "poseidon2-air/params.cuh"
#include "poseidon2-air/tracegen.cuh"
#include "native/utils.cuh"
#include "primitives/trace_access.h"
#include "system/memory/controller.cuh"

Expand Down Expand Up @@ -38,15 +39,6 @@ template <typename T, size_t SBOX_REGISTERS> struct NativePoseidon2Cols {
T specific[COL_SPECIFIC_WIDTH];
};

__device__ void mem_fill_base(
MemoryAuxColsFactory &mem_helper,
uint32_t timestamp,
RowSlice base_aux
) {
uint32_t prev = base_aux[COL_INDEX(MemoryBaseAuxCols, prev_timestamp)].asUInt32();
mem_helper.fill(base_aux, prev, timestamp);
}

template <size_t SBOX_REGISTERS> struct Poseidon2Wrapper {
template <typename T> using Cols = NativePoseidon2Cols<T, SBOX_REGISTERS>;
using Poseidon2Row =
Expand Down
126 changes: 126 additions & 0 deletions extensions/native/circuit/cuda/src/sumcheck.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#include "launcher.cuh"
#include "native/sumcheck.cuh"
#include "native/utils.cuh"
#include "primitives/trace_access.h"
#include "system/memory/controller.cuh"

using namespace native;

__device__ void fill_sumcheck_specific(RowSlice row, MemoryAuxColsFactory &mem_helper) {
RowSlice specific = row.slice_from(COL_INDEX(NativeSumcheckCols, specific));
uint32_t start_timestamp = row[COL_INDEX(NativeSumcheckCols, start_timestamp)].asUInt32();

if (row[COL_INDEX(NativeSumcheckCols, header_row)] == Fp::one()) {
for (uint32_t i = 0; i < 7; ++i) {
mem_fill_base(
mem_helper,
start_timestamp + i,
specific.slice_from(COL_INDEX(HeaderSpecificCols, read_records[i].base))
);
}
uint32_t last_timestamp = row[COL_INDEX(NativeSumcheckCols, last_timestamp)].asUInt32();
mem_fill_base(
mem_helper,
last_timestamp - 1,
specific.slice_from(COL_INDEX(HeaderSpecificCols, write_records.base))
);
} else if (row[COL_INDEX(NativeSumcheckCols, prod_row)] == Fp::one()) {
mem_fill_base(
mem_helper,
start_timestamp,
specific.slice_from(COL_INDEX(ProdSpecificCols, read_records[0].base))
);
if (row[COL_INDEX(NativeSumcheckCols, within_round_limit)] == Fp::one()) {
mem_fill_base(
mem_helper,
start_timestamp + 1,
specific.slice_from(COL_INDEX(ProdSpecificCols, read_records[1].base))
);
mem_fill_base(
mem_helper,
start_timestamp + 2,
specific.slice_from(COL_INDEX(ProdSpecificCols, write_record.base))
);
}
} else if (row[COL_INDEX(NativeSumcheckCols, logup_row)] == Fp::one()) {
mem_fill_base(
mem_helper,
start_timestamp,
specific.slice_from(COL_INDEX(LogupSpecificCols, read_records[0].base))
);
if (row[COL_INDEX(NativeSumcheckCols, within_round_limit)] == Fp::one()) {
mem_fill_base(
mem_helper,
start_timestamp + 1,
specific.slice_from(COL_INDEX(LogupSpecificCols, read_records[1].base))
);
mem_fill_base(
mem_helper,
start_timestamp + 2,
specific.slice_from(COL_INDEX(LogupSpecificCols, write_records[0].base))
);
mem_fill_base(
mem_helper,
start_timestamp + 3,
specific.slice_from(COL_INDEX(LogupSpecificCols, write_records[1].base))
);
}
}
}

__global__ void native_sumcheck_tracegen(
Fp *trace,
size_t height,
size_t width,
const Fp *records,
size_t rows_used,
uint32_t *range_checker_ptr,
uint32_t range_checker_num_bins,
uint32_t timestamp_max_bits
) {
uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= height) {
return;
}

RowSlice row(trace + idx, height);
if (idx < rows_used) {
const Fp *record = records + idx * width;
for (uint32_t col = 0; col < width; ++col) {
row[col] = record[col];
}
MemoryAuxColsFactory mem_helper(
VariableRangeChecker(range_checker_ptr, range_checker_num_bins), timestamp_max_bits
);
fill_sumcheck_specific(row, mem_helper);
} else {
row.fill_zero(0, width);
COL_WRITE_VALUE(row, NativeSumcheckCols, is_end, Fp::one());
}
}

extern "C" int _native_sumcheck_tracegen(
Fp *d_trace,
size_t height,
size_t width,
const Fp *d_records,
size_t rows_used,
uint32_t *d_range_checker,
uint32_t range_checker_num_bins,
uint32_t timestamp_max_bits
) {
assert((height & (height - 1)) == 0);
assert(width == sizeof(NativeSumcheckCols<uint8_t>));
auto [grid, block] = kernel_launch_params(height);
native_sumcheck_tracegen<<<grid, block>>>(
d_trace,
height,
width,
d_records,
rows_used,
d_range_checker,
range_checker_num_bins,
timestamp_max_bits
);
return CHECK_KERNEL();
}
38 changes: 38 additions & 0 deletions extensions/native/circuit/src/cuda_abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,44 @@ pub mod poseidon2_cuda {
}
}

pub mod sumcheck_cuda {
use super::*;

extern "C" {
pub fn _native_sumcheck_tracegen(
d_trace: *mut F,
height: usize,
width: usize,
d_records: *const F,
rows_used: usize,
d_range_checker: *mut u32,
range_checker_max_bins: u32,
timestamp_max_bits: u32,
) -> i32;
}

pub unsafe fn tracegen(
d_trace: &DeviceBuffer<F>,
height: usize,
width: usize,
d_records: &DeviceBuffer<F>,
rows_used: usize,
d_range_checker: &DeviceBuffer<F>,
timestamp_max_bits: u32,
) -> Result<(), CudaError> {
CudaError::from_result(_native_sumcheck_tracegen(
d_trace.as_mut_ptr(),
height,
width,
d_records.as_ptr(),
rows_used,
d_range_checker.as_mut_ptr() as *mut u32,
d_range_checker.len() as u32,
timestamp_max_bits,
))
}
}

pub mod native_loadstore_cuda {
use super::*;

Expand Down
5 changes: 5 additions & 0 deletions extensions/native/circuit/src/extension/cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ use crate::{
jal_rangecheck::{JalRangeCheckAir, JalRangeCheckGpu},
loadstore::{NativeLoadStoreAir, NativeLoadStoreChipGpu},
poseidon2::{air::NativePoseidon2Air, NativePoseidon2ChipGpu},
sumcheck::{air::NativeSumcheckAir, NativeSumcheckChipGpu},
CastFExtension, GpuBackend, Native,
};

Expand Down Expand Up @@ -75,6 +76,10 @@ impl VmProverExtension<GpuBabyBearPoseidon2Engine, DenseRecordArena, Native>
let poseidon2 = NativePoseidon2ChipGpu::<1>::new(range_checker.clone(), timestamp_max_bits);
inventory.add_executor_chip(poseidon2);

inventory.next_air::<NativeSumcheckAir>()?;
let sumcheck = NativeSumcheckChipGpu::new(range_checker.clone(), timestamp_max_bits);
inventory.add_executor_chip(sumcheck);

Ok(())
}
}
Expand Down
14 changes: 12 additions & 2 deletions extensions/native/circuit/src/sumcheck/chip.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use std::borrow::BorrowMut;
use openvm_circuit::{
arch::{
CustomBorrow, ExecutionError, MultiRowLayout, MultiRowMetadata, PreflightExecutor,
RecordArena, TraceFiller, VmChipWrapper, VmStateMut,
RecordArena, SizedRecord, TraceFiller, VmChipWrapper, VmStateMut,
},
system::{
memory::{online::TracingMemory, MemoryAuxColsFactory},
Expand Down Expand Up @@ -76,14 +76,24 @@ impl<'a, F: PrimeField32>
// Each instruction record consists solely of some number of contiguously
// stored NativeSumcheckCols<...> structs, each of which corresponds to a
// single trace row. Trace fillers don't actually need to know how many rows
// each instruction uses, and can thus treat each NativePoseidon2Cols<...>
// each instruction uses, and can thus treat each NativeSumcheckCols<...>
// as a single record.
NativeSumcheckRecordLayout {
metadata: NativeSumcheckMetadata { num_rows: 1 },
}
}
}

impl<F: PrimeField32> SizedRecord<NativeSumcheckRecordLayout> for NativeSumcheckRecordMut<'_, F> {
fn size(layout: &NativeSumcheckRecordLayout) -> usize {
layout.metadata.num_rows * size_of::<NativeSumcheckCols<F>>()
}

fn alignment(_layout: &NativeSumcheckRecordLayout) -> usize {
align_of::<NativeSumcheckCols<F>>()
}
}

#[derive(derive_new::new, Copy, Clone)]
pub struct NativeSumcheckExecutor;

Expand Down
Loading
Loading