From b6f41e242c8e02b71ae59fa01970eaa7735f6c79 Mon Sep 17 00:00:00 2001
From: Georg Wiese <georgwiese@gmail.com>
Date: Mon, 8 Jul 2024 17:20:39 +0200
Subject: [PATCH] Add `PoseidonGLMemory` machine (#1525)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implements #1055 for the Poseidon machines. Pulled out of #1508.

Specifically, this PR adds a new `PoseidonGLMemory` machine which
receives 2 memory points and then:
- Reads 24 32-Bit words and packs them into 12 field elements
- Computes the Poseidon permutation (just like `PoseidonGL`)
- For each of the 4 output field elements, it:
- Invokes the `SplitGL` machine to get the canonical `u64`
representation
  - Writes the 8 32-Bit words to memory at the provided memory pointer

The read and write memory regions can even overlap! 🎉

This should simplify our RISC-V machine, as the syscall already expects
two memory pointers. We can simply pass it to the machine directly.

I started doing that in #1533, but I think it makes sense to wait until
#1443 is merged.

To test:
```
cargo run -r pil test_data/std/poseidon_gl_test.asm -o output -f --export-csv --prove-with estark-starky
```

I recommend reviewing the diff between
`std/machines/hash/poseidon_gl.asm` and
`std/machines/hash/poseidon_gl_memory.asm`

### Discussion

The overhead of the memory read / write is quite high (18 extra witness
columns, see [this
comment](https://github.com/powdr-labs/powdr/blob/40bdca4368c3accccb753aa35ac1027ccb8def0e/std/machines/hash/poseidon_gl_memory.asm#L13-L23),
mostly because we now need to have the input available in all rows
(which previously was only the case for the outputs). If we had offsets
other than 0 and 1, this could be avoided. Doing 24 parallel memory
reads in the first row would *not* help, because we'd have to add 24
witness columns (instead of 2 now) to store the result of the memory
operation.

A few more notes:
- With Vadcop, 18 extra witness columns in a secondary machine is *a
lot* better than introducing more registers (either "regular" registers
or assignment registers) in the main machine
- As mentioned
[here](https://github.com/powdr-labs/powdr/blob/40bdca4368c3accccb753aa35ac1027ccb8def0e/std/machines/hash/poseidon_gl_memory.asm#L111-L113),
we could get rid of two permutations if either:
- We were able to express explicitly that we want to call at most one
operation in the current row, or
- We had an optimizer that would be smart enough to batch the memory
reads and writes.
- We could also have just 1 read or write at a time (instead of 2), but
we'd have to increase the block size from 31 to 32 and the
implementation would be more complicated.
- We could also store the full final state of the Poseidon permutation,
instead of just the first 4 elements. This would need 8 more witness
columns to make the entire output available in all rows. Then, one could
use the machine to implement a Poseidon sponge, instead of.
- Looking at the bootloader, maybe it makes sense to pass 3 input
pointers instead of 1: One for the first 4 elements, one for the next 4,
and one for the capacity (often just a constant). For example, when
computing a Merkle root, you'd pass pointers for the two children hashes
and a pointer to the capacity constant.
---
 pipeline/tests/powdr_std.rs               |   7 +
 std/machines/hash/mod.asm                 |   1 +
 std/machines/hash/poseidon_bn254.asm      |   3 -
 std/machines/hash/poseidon_gl.asm         |   1 -
 std/machines/hash/poseidon_gl_memory.asm  | 198 ++++++++++++++++++++++
 test_data/std/poseidon_gl_memory_test.asm | 156 +++++++++++++++++
 6 files changed, 362 insertions(+), 4 deletions(-)
 create mode 100644 std/machines/hash/poseidon_gl_memory.asm
 create mode 100644 test_data/std/poseidon_gl_memory_test.asm

diff --git a/pipeline/tests/powdr_std.rs b/pipeline/tests/powdr_std.rs
index 96bbc47c2..a227dd015 100644
--- a/pipeline/tests/powdr_std.rs
+++ b/pipeline/tests/powdr_std.rs
@@ -32,6 +32,13 @@ fn poseidon_gl_test() {
     gen_estark_proof(f, Default::default());
 }
 
+#[test]
+fn poseidon_gl_memory_test() {
+    let f = "std/poseidon_gl_memory_test.asm";
+    verify_test_file(f, Default::default(), vec![]).unwrap();
+    gen_estark_proof(f, Default::default());
+}
+
 #[test]
 fn split_bn254_test() {
     let f = "std/split_bn254_test.asm";
diff --git a/std/machines/hash/mod.asm b/std/machines/hash/mod.asm
index b8837dd7f..f173cc68a 100644
--- a/std/machines/hash/mod.asm
+++ b/std/machines/hash/mod.asm
@@ -1,2 +1,3 @@
 mod poseidon_bn254;
 mod poseidon_gl;
+mod poseidon_gl_memory;
diff --git a/std/machines/hash/poseidon_bn254.asm b/std/machines/hash/poseidon_bn254.asm
index 0f654b04f..21e83c583 100644
--- a/std/machines/hash/poseidon_bn254.asm
+++ b/std/machines/hash/poseidon_bn254.asm
@@ -2,9 +2,6 @@ use std::array;
 use std::utils::unchanged_until;
 
 // Implements the Poseidon permutation for the BN254 curve.
-// Note that this relies on the trace table being non-wrapping, so it will
-// only work with the Halo2 backend (which is the only backend that supports
-// the BN254 curve).
 machine PoseidonBN254 with
     latch: FIRSTBLOCK,
     operation_id: operation_id,
diff --git a/std/machines/hash/poseidon_gl.asm b/std/machines/hash/poseidon_gl.asm
index 89983d3a1..be7b8bc57 100644
--- a/std/machines/hash/poseidon_gl.asm
+++ b/std/machines/hash/poseidon_gl.asm
@@ -32,7 +32,6 @@ machine PoseidonGL with
     let PARTIAL_ROUNDS: int = 22;
     let ROWS_PER_HASH = FULL_ROUNDS + PARTIAL_ROUNDS + 1;
 
-    pol constant L0 = [1] + [0]*;
     pol constant FIRSTBLOCK(i) { if i % ROWS_PER_HASH == 0 { 1 } else { 0 } };
     pol constant LASTBLOCK(i) { if i % ROWS_PER_HASH == ROWS_PER_HASH - 1 { 1 } else { 0 } };
     // Like LASTBLOCK, but also 1 in the last row of the table
diff --git a/std/machines/hash/poseidon_gl_memory.asm b/std/machines/hash/poseidon_gl_memory.asm
new file mode 100644
index 000000000..14a3ec490
--- /dev/null
+++ b/std/machines/hash/poseidon_gl_memory.asm
@@ -0,0 +1,198 @@
+use std::array;
+use std::check::assert;
+use std::utils::unchanged_until;
+use std::utils::force_bool;
+use std::utils::sum;
+use std::convert::expr;
+use std::machines::memory::Memory;
+use std::machines::split::split_gl::SplitGL;
+
+// Implements the Poseidon permutation for the Goldilocks field.
+// This version of the Poseidon machine receives memory pointers and interacts
+// with memory directly to fetch its inputs and write its outputs.
+// In comparison to std::machines::hash::poseidon_gl::PoseidonGL, this machine has:
+// - 18 extra witness columns:
+//   - 12 to make the input state available in all rows
+//   - 3 to make the time step, input address, and output address available in all rows
+//   - 2 to store the low and high words of the memory read
+//   - 1 to store whether a memory read should be done (could be removed if we use an intermediate polynomial, see below)
+// - 16 extra fixed columns to store a one-hot encoding of the row number (for the first 12 + 4 rows)
+// - 5 extra permutations:
+//   - 2 to read the low and high words from the memory
+//   - 2 to write the low and high words to the memory
+//   - 1 to split the current output into low and high words
+machine PoseidonGLMemory(mem: Memory, split_gl: SplitGL) with
+    latch: CLK_0,
+    operation_id: operation_id,
+    // Allow this machine to be connected via a permutation
+    call_selectors: sel,
+{
+
+    // Hashes 8 "rate" elements and 4 "capacity" elements to 4 field elements
+    // by applying the Poseidon permutation and returning the first 4 rate elements.
+    // When the hash function is used only once, the capacity elements should be
+    // set to constants, where different constants can be used to define different
+    // hash functions.
+    // The input data is passed via a memory pointer: The machine will read 24
+    // 32-Bit machine words, interpreted as 12 field elements stored in little-endian
+    // format.
+    // Similarly, the output data is written to memory at the provided pointer as
+    // 8 32-Bit machine words representing 4 field elements in little-endian format
+    // (in canonical form).
+    // Reads happen at the provided time step; writes happen at the next time step.
+    operation poseidon_permutation<0> input_addr, output_addr, time_step ->;
+
+    col witness operation_id;
+
+    // Ported from:
+    // - https://github.com/0xPolygonHermez/zkevm-proverjs/blob/main/pil/poseidong.pil
+    // - https://github.com/0xPolygonHermez/zkevm-proverjs/blob/main/src/sm/sm_poseidong.js
+
+    // Number of field elements in the state
+    let STATE_SIZE: int = 12;
+    // Number of output elements
+    let OUTPUT_SIZE: int = 4;
+    // Number of full rounds
+    let FULL_ROUNDS: int = 8;
+    // Number of partial rounds (half of them before and half of them after the full rounds)
+    let PARTIAL_ROUNDS: int = 22;
+    let ROWS_PER_HASH = FULL_ROUNDS + PARTIAL_ROUNDS + 1;
+
+
+    // ------------- Begin memory read / write ---------------
+
+    // Get an intermediate column that indicates that we're in an
+    // actual block, not a default block. Its value is constant
+    // within the block.
+    let used = array::sum(sel);
+    array::map(sel, |s| unchanged_until(s, LAST));
+    std::utils::force_bool(used);
+
+    // Repeat the input state in the whole block
+    col witness input[STATE_SIZE];
+    array::map(input, |c| unchanged_until(c, LAST));
+    array::zip(input, state, |i, s| CLK[0] * (i - s) = 0);
+
+    // Repeat the time step and input / output address in the whole block
+    col witness time_step;
+    col witness input_addr;
+    col witness output_addr;
+    unchanged_until(time_step, LAST);
+    unchanged_until(input_addr, LAST);
+    unchanged_until(output_addr, LAST);
+    
+    // One-hot encoding of the row number (for the first <STATE_SIZE + OUTPUT_SIZE> rows)
+    assert(STATE_SIZE + OUTPUT_SIZE < ROWS_PER_HASH, || "Not enough rows to do memory read / write");
+    let CLK: col[STATE_SIZE + OUTPUT_SIZE] = array::new(STATE_SIZE + OUTPUT_SIZE, |i| |row| if row % ROWS_PER_HASH == i { 1 } else { 0 });
+    let CLK_0 = CLK[0];
+
+    col witness word_low, word_high;
+
+    // Do *two* memory reads in each of the first STATE_SIZE rows
+    // For input i, we expect the low word at address input_addr + 8 * i and
+    // the high word at address input_addr + 8 * i + 4
+    // TODO: This could be an intermediate polynomial, but for some reason estark-starky
+    // fails then, so we keep it as a witness for now
+    let do_mload;
+    do_mload = used * sum(STATE_SIZE, |i| CLK[i]);
+    let input_index = sum(STATE_SIZE, |i| expr(i) * CLK[i]);
+    link if do_mload ~> word_low = mem.mload(input_addr + 8 * input_index, time_step);
+    link if do_mload ~> word_high = mem.mload(input_addr + 8 * input_index + 4, time_step);
+
+    // Combine the low and high words and write it into `input`
+    let current_input = array::sum(array::new(STATE_SIZE, |i| CLK[i] * input[i]));
+    do_mload * (word_low + word_high * 2**32 - current_input) = 0;
+
+    // Do *two* memory writes in each of the next OUTPUT_SIZE rows
+    // For output i, we write the low word at address output_addr + 8 * i and
+    // the high word at address output_addr + 8 * i + 4
+    let do_mstore = used * sum(OUTPUT_SIZE, |i| CLK[i + STATE_SIZE]);
+    let output_index = sum(OUTPUT_SIZE, |i| expr(i) * CLK[i + STATE_SIZE]);
+    // TODO: This translates to two additional permutations. But because they go to the same machine
+    // as the mloads above *and* never happen at the same time, they could actually be combined with
+    // the mload permutations. But there is currently no way to express this.
+    link if do_mstore ~> mem.mstore(output_addr + 8 * output_index, time_step + 1, word_low);
+    link if do_mstore ~> mem.mstore(output_addr + 8 * output_index + 4, time_step + 1, word_high);
+
+    // Make sure that in row i + STATE_SIZE, word_low and word_high correspond to output i
+    let current_output = array::sum(array::new(OUTPUT_SIZE, |i| CLK[i + STATE_SIZE] * output[i]));
+    link if do_mstore ~> (word_low, word_high) = split_gl.split(current_output);
+
+
+    // ------------- End memory read / write ---------------
+
+    pol constant LASTBLOCK(i) { if i % ROWS_PER_HASH == ROWS_PER_HASH - 1 { 1 } else { 0 } };
+    // Like LASTBLOCK, but also 1 in the last row of the table
+    // Specified this way because we can't access the degree in the match statement
+    pol constant LAST = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]* + [1];
+
+    // Whether the current round is a partial round
+    pol constant PARTIAL = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]*;
+    
+    // The round constants
+    pol constant C_0 = [0xb585f766f2144405, 0x86287821f722c881, 0xe9fa634a21de0082, 0x92a756e67e2b9413, 0x3cc3f892184df408, 0x7131aa45268d7d8c, 0x99ad1aab0814283b, 0xeb84f608da56ef48, 0x7159cd30c3ac118e, 0xdcef0797c2b69ec7, 0xd0762cbc8ca6570c, 0x30a4680593258387, 0x15a16a8a8322d458, 0x5a3f1bb1c53a9645, 0x775005982d74d7f7, 0xf9cc95c22b4c1fcc, 0xc49366bb25e8513, 0xdd611f1000c17442, 0x2ff876fa5ef97c4, 0x3d06c8bd1514e2d9, 0xe89cd854d5d01d33, 0xece5a71e0cfedc75, 0x90004c1371b893c5, 0xde122bebe9a39368, 0x4d61e56a525d225a, 0x1478d361dbbf9fac, 0x475cd3205a3bdcde, 0xe70201e960cb78b8, 0x7be5b9ffda905e1c, 0xf3c12fe54d5c653b, 0x0]*;
+    pol constant C_1 = [0x7746a55f43921ad7, 0x59cd1a8a41c18e55, 0xf56f6d48959a600d, 0x70e741ebfee96586, 0x2e479dc157bf31bb, 0x9351036095630f9f, 0x438a7c91d416ca4d, 0xda608834c40e603d, 0x839b4e8fafead540, 0x3d639263da827b13, 0x34c6efb812b04bf5, 0x337dc00c61bd9ce1, 0x388a128b7fd9a609, 0xdb7f023869fb8d38, 0x86ab99b4dde6c8b0, 0x8d37f755f4ae9f6, 0x784d3d2f1698309, 0xd8185f9adfea4fd0, 0xc5cb72a2a51159b0, 0x9c9c98207cb10767, 0x5cd377dc8bb882a2, 0x5ff98fd5d51fe610, 0xb932b7cf752e5545, 0x4d001fd58f002526, 0x262e963c8da05d3d, 0x6f02dc07d141875c, 0x18a42105c31b7e88, 0x6f90ff3b6a65f108, 0xa3c95eaec244aa30, 0x40b9e922ed9771e2, 0x0]*;
+    pol constant C_2 = [0xb2fb0d31cee799b4, 0xc3b919ad495dc574, 0xf7d713e806391165, 0x19d5ee2af82ec1c, 0x6f49de07a6234346, 0xad535b24afc26bfb, 0xb60de3bcc5ea751c, 0x8f97fe408061f183, 0xd3f3e5e82920adc, 0xe273fd971bc8d0e7, 0x40bf0ab5fa14c112, 0xd5eca244c7a4ff1d, 0x2300e5d6baedf0fb, 0xb462065911d4e1fc, 0xb1204f603f51c080, 0xeec49b613478675b, 0x530fb67ea1809a81, 0xef87139ca9a3ab1e, 0x8470f39d2d5c900e, 0x65700b51aedfb5ef, 0xa7b0fb7883eee860, 0x83e8941918964615, 0xa0b1df81b6fe59fc, 0xca6637000eb4a9f8, 0x59e89b094d220ec2, 0x296a202ed8e556a2, 0x23e7414af663068, 0x42747a7245e7fa84, 0x230bca8f4df0544, 0x551f5b0fbe7b1840, 0x0]*;
+    pol constant C_3 = [0xf6760a4803427d7, 0xa484c4c5ef6a0781, 0x8297132b32825daf, 0x6f6f2ed772466352, 0x213ce7bede378d7b, 0x4627f5c6993e44be, 0xc99cab6aef6f58bc, 0xa93f485c96f37b89, 0x8f7d83bddee7bba8, 0x418f02702d227ed5, 0xb6b570fc7c5740d3, 0x7762638264d279bd, 0x2f63aa8647e15104, 0x49c24ae4437d8030, 0xef61ac8470250ecf, 0xf143933aed25e0b0, 0x410492299bb01f49, 0x3ba71336c34ee133, 0x25abb3f1d39fcb76, 0x911f451539869408, 0x7684403ec392950d, 0x5922040b47f150c1, 0x8ef1dd26770af2c2, 0x2f2339d624f91f78, 0x55d5b52b78b9c5e, 0x2afd67999bf32ee5, 0x15147108121967d7, 0xd1f507e43ab749b2, 0x4135c2bebfe148c6, 0x25032aa7c4cb1811, 0x0]*;
+    pol constant C_4 = [0xe10d666650f4e012, 0x308bbd23dc5416cc, 0xad6805e0e30b2c8a, 0x7cf416cfe7e14ca1, 0x5b0431345d4dea83, 0x645cf794b8f1cc58, 0x69a5ed92a72ee4ff, 0x6704e8ee8f18d563, 0x780f2243ea071d06, 0x8c25fda3b503038c, 0x5a27b9002de33454, 0xc1e434bedeefd767, 0xf1c36ce86ecec269, 0xd793862c112b0566, 0x1bbcd90f132c603f, 0xe4c5dd8255dfc622, 0x139542347424b9ac, 0x7d3a455d56b70238, 0x23eb8cc9b372442f, 0x7ae6849fbc3a0ec6, 0x5fa3f06f4fed3b52, 0xf97d750e3dd94521, 0x541a4f9cfbeed35, 0x6d1a7918c80df518, 0x82b27eb33514ef99, 0x7acfd96efa95491d, 0xe4a3dff1d7d6fef9, 0x1c86d265f15750cd, 0x166fc0cc438a3c72, 0xaaed34074b164346, 0x0]*;
+    pol constant C_5 = [0x8cae14cb07d09bf1, 0x6e4a40c18f30c09c, 0xac51d9f5fcf8535e, 0x61df517b86a46439, 0xa2de45780344d6a1, 0x241c70ed0af61617, 0x5e7b329c1ed4ad71, 0xcee3e9ac1e072119, 0xeb915845f3de1634, 0x2cbaed4daec8c07c, 0xb1a5b165b6d2b2d2, 0x299351a53b8ec22, 0x27181125183970c9, 0xaadd1106730d8feb, 0xcd1dabd964db557, 0xe7ad7756f193198e, 0x9cb0bd5ea1a1115e, 0x660d32e130182684, 0xd687ba55c64f6364, 0x3bb340eba06afe7e, 0x8df57ac11bc04831, 0x5080d4c2b86f56d7, 0x9e61106178bfc530, 0xdf9a4939342308e9, 0xd30094ca96b7ce7b, 0x6798ba0c0abb2c6d, 0x1a8d1a588085737, 0x3996ce73dd832c1c, 0x3762b59a8ae83efa, 0x8ffd96bbf9c9c81d, 0x0]*;
+    pol constant C_6 = [0xd438539c95f63e9f, 0x9a2eedb70d8f8cfa, 0x502ad7dc18c2ad87, 0x85dc499b11d77b75, 0x7103aaf94a7bf308, 0xacb8e076647905f1, 0x5fc0ac0800144885, 0x510d0e65e2b470c1, 0xd19e120d26b6f386, 0x5f58e6afcdd6ddc2, 0x8722e0ace9d1be22, 0xb2d456e4ad251b80, 0xe584029370dca96d, 0xc43b6e0e97b0d568, 0x11a3ae5beb9d1ec9, 0x92c2318b87fff9cb, 0x2e3f615c38f49a1, 0x297a863f48cd1f43, 0xda8d9e90fd8ff158, 0xb46e9d8b682ea65e, 0x2db01efa1e1e1897, 0xa7de115b56c78d70, 0xb3767e80935d8af2, 0xebc2151ee6c8398c, 0xcf5cb381cd0a1535, 0x34c6f57b26c92122, 0x11b4c74eda62beef, 0x8e7fba02983224bd, 0xe8928a4c89114750, 0x70fc91eb5937085c, 0x0]*;
+    pol constant C_7 = [0xef781c7ce35b4c3d, 0xe360c6e0ae486f38, 0x57a1550c110b3041, 0x4b959b48b9c10733, 0x5326fc0d97279301, 0x3737e9db4c4f474d, 0x32db829239774eca, 0xf6323f486b9038f0, 0x16ee53a7e5fecc6, 0x284650ac5e1b0eba, 0x788ee3b37e5680fb, 0x3e9ed1fda49cea0b, 0x4d9bbc3e02f1cfb2, 0xe29024c18ee6fca2, 0xf755bfeea585d11d, 0x739c25f8fd73596d, 0x985d4f4a9c5291ef, 0x90e0a736a751ebb7, 0xe3cbdc7d2fe45ea7, 0x8dcf22f9a3b34356, 0x54846de4aadb9ca2, 0x6a9242ac87538194, 0x98d5782065af06, 0x3cc2ba8a1116515, 0xfeed4db6919e5a7c, 0x5736e1bad206b5de, 0xe587cc0d69a73346, 0xba0dec7103255dd4, 0x2a440b51a4945ee5, 0x7f795e2a5f915440, 0x0]*;
+    pol constant C_8 = [0xcdc4a239b0c44426, 0xd5c7718fbfc647fb, 0x66bbd30e6ce0e583, 0xe8be3e5da8043e57, 0xa9ceb74fec024747, 0xe7ea5e33e75fffb6, 0xade699c5830f310, 0xb508cdeffa5ceef, 0xcb5fd54e7933e477, 0x635b337ee819dab5, 0x14a726661551e284, 0x2972a92ba450bed8, 0xea35bc29692af6f8, 0x5e50c27535b88c66, 0xa3b83250268ea4d7, 0x5636cac9f16dfed0, 0x775b9feafdcd26e7, 0x549f80ce550c4fd3, 0xb9a8c9b3aee52297, 0x77bdaeda586257a7, 0xba6745385893c784, 0xf7856ef7f9173e44, 0x31d191cd5c1466c7, 0xd341d037e840cf83, 0x41703f53753be59f, 0x20057d2a0056521b, 0x1ff7327017aa2a6e, 0x9e9cbd781628fc5b, 0x80cefd2b7d99ff83, 0x4543d9df5476d3cb, 0x0]*;
+    pol constant C_9 = [0x277fa208bf337bff, 0xc35eae071903ff0b, 0xda2abef589d644e, 0xf5c0bc1de6da8699, 0x27f8ec88bb21b1a3, 0x90dee49fc9bfc23a, 0x7cc5583b10415f21, 0xf2417089e4fb3cbd, 0xacb8417879fd449f, 0x9f9a036ed4f2d49f, 0x98b7672f9ef3b419, 0x20216dd77be493de, 0x18e21b4beabb4137, 0x10383f20a4ff9a87, 0x516306f4927c93af, 0xdd8f909a938e0172, 0x304265a6384f0f2d, 0xf73b2922f38bd64, 0xc0d28a5c10960bd3, 0xf19e400a5104d20d, 0x541d496344d2c75b, 0x2265fc92feb0dc09, 0x410fefafa319ac9d, 0x387cb5d25af4afcc, 0x5eeea940fcde8b6f, 0x3dea5bd5d0578bd7, 0x594e29c42473d06b, 0xdae8645996edd6a5, 0xbb9879c6e61fd62a, 0xf172d73e004fc90d, 0x0]*;
+    pol constant C_10 = [0xe17653a29da578a1, 0x849c2656969c4be7, 0xf061274fdb150d61, 0x40b12cbf09ef74bf, 0xfceb4fda1ded0893, 0xd1b1edf76bc09c92, 0x85df9ed2e166d64f, 0x60e75c2890d15730, 0x9c22190be7f74732, 0xb93e260cae5c170e, 0xbb93ae776bb30e3a, 0xadffe8cf28449ec6, 0x1e3b9fc625b554f4, 0x38e8ee9d71a45af8, 0xddb4ac49c9efa1da, 0xc6401fe115063f5b, 0x593664c39773012c, 0x16bf1f73fb7a9c3f, 0x45d7ac9b68f71a34, 0xc368a348e46d950f, 0xe909678474e687fe, 0x17dfc8e4f7ba8a57, 0xbdf8f242e316c4ab, 0xbba2515f22909e87, 0x4cd1f1b175100206, 0x16e50d897d4634ac, 0xf6f31db1899b12d5, 0xdebe0853b1a1d378, 0x6e7c8f1a84265034, 0xdfd1c4febcc81238, 0x0]*;
+    pol constant C_11 = [0xc54302f225db2c76, 0xc0572c8c08cbbbad, 0x28b8ec3ae9c29633, 0xa637093ecb2ad631, 0xfac6ff1346a41675, 0xb65481ba645c602, 0x6604df4fee32bcb1, 0xa6217d8bf660f29c, 0x5d693c1ba3ba3621, 0xb0a7eae879ddb76d, 0x28fd3b046380f850, 0x1c4dbb1c4c27d243, 0x25d64362697828fd, 0xdd5118375bf1a9b9, 0x64bb6dec369d4418, 0x8ad97b33f1ac1455, 0x4f0a2e5fb028f2ce, 0x6d1f5a59005bec17, 0xeeb76e397069e804, 0x9ef1cd60e679f284, 0xdfe89923f6c9c2ff, 0x9001a64209f21db8, 0x9e8cd55b57637ed0, 0x7248fe7705f38e47, 0x4a20358574454ec0, 0x29bff3ecb9b7a6e3, 0xc02ac5e47312d3ca, 0xa49229d24d014343, 0x164bb2de1bbeddc8, 0xbc8dfb627fe558fc, 0x0]*;
+    let C = [C_0, C_1, C_2, C_3, C_4, C_5, C_6, C_7, C_8, C_9, C_10, C_11];
+
+    // State of the Poseidon permutation (8 rate elements and 4 capacity elements)
+    pol commit state[STATE_SIZE];
+
+    // The first OUTPUT_SIZE elements of the *final* state
+    // (constrained to be constant within the block and equal to parts of the state in the last row)
+    pol commit output[OUTPUT_SIZE];
+
+    // Add round constants
+    let a: expr[STATE_SIZE] = array::zip(state, C, |state, C| state + C);
+
+    // Compute S-Boxes (x^7)
+    let x2: expr[STATE_SIZE] = array::map(a, |a| a * a);
+    let x4: expr[STATE_SIZE] = array::map(x2, |x2| x2 * x2);
+    let x6: expr[STATE_SIZE] = array::zip(x4, x2, |x4, x2| x4 * x2);
+    let x7: expr[STATE_SIZE] = array::zip(x6, a, |x6, a| x6 * a);
+
+    // Apply S-Boxes on the first element and otherwise if it is a full round.
+    let b: expr[STATE_SIZE] = array::new(STATE_SIZE, |i| if i == 0 {
+        x7[i]
+    } else {
+        PARTIAL * (a[i] - x7[i]) + x7[i]
+    });
+
+    // The MDS matrix
+    let M = [
+        [25, 15, 41, 16,  2, 28, 13, 13, 39, 18, 34, 20],
+        [20, 17, 15, 41, 16,  2, 28, 13, 13, 39, 18, 34],
+        [34, 20, 17, 15, 41, 16,  2, 28, 13, 13, 39, 18],
+        [18, 34, 20, 17, 15, 41, 16,  2, 28, 13, 13, 39],
+        [39, 18, 34, 20, 17, 15, 41, 16,  2, 28, 13, 13],
+        [13, 39, 18, 34, 20, 17, 15, 41, 16,  2, 28, 13],
+        [13, 13, 39, 18, 34, 20, 17, 15, 41, 16,  2, 28],
+        [28, 13, 13, 39, 18, 34, 20, 17, 15, 41, 16,  2],
+        [ 2, 28, 13, 13, 39, 18, 34, 20, 17, 15, 41, 16],
+        [16,  2, 28, 13, 13, 39, 18, 34, 20, 17, 15, 41],
+        [41, 16,  2, 28, 13, 13, 39, 18, 34, 20, 17, 15],
+        [15, 41, 16,  2, 28, 13, 13, 39, 18, 34, 20, 17]
+    ];
+
+    // Multiply with MDS Matrix
+    let dot_product = |v1, v2| array::sum(array::zip(v1, v2, |v1_i, v2_i| v1_i * v2_i));
+    let c: expr[STATE_SIZE] = array::map(M, |M_row_i| dot_product(M_row_i, b));
+
+    // Copy c to state in the next row
+    array::zip(state, c, |state, c| (state' - c) * (1-LAST) = 0);
+
+    // In the last row, the first OUTPUT_SIZE elements of the state should equal output
+    array::zip(output, state, |output, state| LASTBLOCK * (output - state) = 0);
+
+    // The output should stay constant in the block
+    array::map(output, |c| unchanged_until(c, LAST));
+}
diff --git a/test_data/std/poseidon_gl_memory_test.asm b/test_data/std/poseidon_gl_memory_test.asm
new file mode 100644
index 000000000..0166ee70c
--- /dev/null
+++ b/test_data/std/poseidon_gl_memory_test.asm
@@ -0,0 +1,156 @@
+use std::machines::hash::poseidon_gl_memory::PoseidonGLMemory;
+use std::machines::memory::Memory;
+use std::machines::split::split_gl::SplitGL;
+
+machine Main with degree: 65536 {
+    reg pc[@pc];
+    reg X1[<=];
+    reg X2[<=];
+    reg X3[<=];
+    reg X4[<=];
+    reg X5[<=];
+    reg X6[<=];
+    reg X7[<=];
+    reg X8[<=];
+    reg X9[<=];
+    reg X10[<=];
+    reg X11[<=];
+    reg X12[<=];
+    reg ADDR1[<=];
+    reg ADDR2[<=];
+
+    SplitGL split;
+
+    // Increase the time step by 2 in each row, so that the poseidon machine
+    // can read in the given time step and write in the next time step.
+    col fixed STEP(i) { 2 * i };
+    Memory memory;
+    instr mstore_le ADDR1, X1, X2 ->
+        link ~> memory.mstore(ADDR1, STEP, X2)
+        link ~> memory.mstore(ADDR1 + 4, STEP, X1);
+
+    PoseidonGLMemory poseidon(memory, split);
+    instr poseidon ADDR1, ADDR2 -> link ~> poseidon.poseidon_permutation(ADDR1, ADDR2, STEP);
+
+    col witness val_low, val_high;
+    instr assert_eq ADDR1, X1 ->
+        link ~> val_low = memory.mload(ADDR1, STEP)
+        link ~> val_high = memory.mload(ADDR1 + 4, STEP)
+    {
+        val_low + 2**32 * val_high = X1
+    }
+
+    function main {
+
+        // See test vectors at:
+        // https://github.com/0xPolygonHermez/zkevm-testvectors/blob/main/poseidon/raw-hash.json
+
+        mstore_le 0, 0, 0;
+        mstore_le 8, 0, 0;
+        mstore_le 16, 0, 0;
+        mstore_le 24, 0, 0;
+        mstore_le 32, 0, 0;
+        mstore_le 40, 0, 0;
+        mstore_le 48, 0, 0;
+        mstore_le 56, 0, 0;
+        mstore_le 64, 0, 0;
+        mstore_le 72, 0, 0;
+        mstore_le 80, 0, 0;
+        mstore_le 88, 0, 0;
+ 
+        poseidon 0, 0;
+
+        assert_eq 0, 4330397376401421145;
+        assert_eq 8, 14124799381142128323;
+        assert_eq 16, 8742572140681234676;
+        assert_eq 24, 14345658006221440202;
+
+        mstore_le 0, 0, 1;
+        mstore_le 8, 0, 1;
+        mstore_le 16, 0, 1;
+        mstore_le 24, 0, 1;
+        mstore_le 32, 0, 1;
+        mstore_le 40, 0, 1;
+        mstore_le 48, 0, 1;
+        mstore_le 56, 0, 1;
+        mstore_le 64, 0, 1;
+        mstore_le 72, 0, 1;
+        mstore_le 80, 0, 1;
+        mstore_le 88, 0, 1;
+
+        poseidon 0, 0;
+
+        assert_eq 0, 16428316519797902711;
+        assert_eq 8, 13351830238340666928;
+        assert_eq 16, 682362844289978626;
+        assert_eq 24, 12150588177266359240;
+
+        // Pass test vector (-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1)
+        mstore_le 0, 0xffffffff, 0;
+        mstore_le 8, 0xffffffff, 0;
+        mstore_le 16, 0xffffffff, 0;
+        mstore_le 24, 0xffffffff, 0;
+        mstore_le 32, 0xffffffff, 0;
+        mstore_le 40, 0xffffffff, 0;
+        mstore_le 48, 0xffffffff, 0;
+        mstore_le 56, 0xffffffff, 0;
+        mstore_le 64, 0xffffffff, 0;
+        mstore_le 72, 0xffffffff, 0;
+        mstore_le 80, 0xffffffff, 0;
+        mstore_le 88, 0xffffffff, 0;
+
+        poseidon 0, 0;
+
+        assert_eq 0, 13691089994624172887;
+        assert_eq 8, 15662102337790434313;
+        assert_eq 16, 14940024623104903507;
+        assert_eq 24, 10772674582659927682;
+
+        // Pass test vector (923978, 235763497586, 9827635653498, 112870, 289273673480943876, 230295874986745876, 6254867324987, 2087, 0, 0, 0, 0)
+        // = (0xe194a, 0x36_e4997a72, 0x8f0_2cbb6b7a, 0x1b8e6, 0x403b4df_96b43904, 0x3322ce0_cdd2a014, 0x5b0_5325203b, 0x827, 0x0, 0x0, 0x0, 0x0)
+        mstore_le 0, 0, 0xe194a;
+        mstore_le 8, 0x36, 0xe4997a72;
+        mstore_le 16, 0x8f0, 0x2cbb6b7a;
+        mstore_le 24, 0, 0x1b8e6;
+        mstore_le 32, 0x403b4df, 0x96b43904;
+        mstore_le 40, 0x3322ce0, 0xcdd2a014;
+        mstore_le 48, 0x5b0, 0x5325203b;
+        mstore_le 56, 0, 0x827;
+        mstore_le 64, 0, 0;
+        mstore_le 72, 0, 0;
+        mstore_le 80, 0, 0;
+        mstore_le 88, 0, 0;
+
+        poseidon 0, 0;
+
+        assert_eq 0, 1892171027578617759;
+        assert_eq 8, 984732815927439256;
+        assert_eq 16, 7866041765487844082;
+        assert_eq 24, 8161503938059336191;
+
+        // Repeat the first test, but be fancy with the memory pointers being passed:
+
+        mstore_le 100, 0, 0;
+        mstore_le 108, 0, 0;
+        mstore_le 116, 0, 0;
+        mstore_le 124, 0, 0;
+        mstore_le 132, 0, 0;
+        mstore_le 140, 0, 0;
+        mstore_le 148, 0, 0;
+        mstore_le 156, 0, 0;
+        mstore_le 164, 0, 0;
+        mstore_le 172, 0, 0;
+        mstore_le 180, 0, 0;
+        mstore_le 188, 0, 0;
+ 
+        // This will read bytes [100, 191] and write the result to bytes [104, 131]
+        poseidon 100, 104;
+
+        assert_eq 104, 4330397376401421145;
+        assert_eq 112, 14124799381142128323;
+        assert_eq 120, 8742572140681234676;
+        assert_eq 128, 14345658006221440202;
+
+        return;
+    }
+}